使用Python实现基于GRU网络的中文人机对话系统

发布时间：2023-12-12 07:58:23

下面是一个使用Python实现基于GRU网络的中文人机对话系统的示例，包含数据预处理、模型定义、训练和测试等步骤。

import numpy as np
import jieba
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# 数据预处理
def preprocess_data(raw_data):
    """Convert raw dialogue text into padded training arrays and vocabularies.

    ``raw_data`` is expected to hold one utterance per line, alternating
    question, answer, question, answer, ...  Each line is stripped of
    surrounding whitespace and blank lines are ignored, so an indented
    triple-quoted string can be passed directly.  A trailing question that
    has no answer line is dropped.

    Args:
        raw_data: Newline-separated dialogue text.

    Returns:
        A tuple ``(input_sequences, target_sequences, max_length, num_words,
        word2idx, idx2word)`` where
        * ``input_sequences`` are the post-padded question id sequences,
        * ``target_sequences`` are the post-padded answer id sequences,
          one-hot encoded over ``num_words`` classes,
        * ``max_length`` is the longest question/answer length in tokens,
        * ``num_words`` is the vocabulary size including ``<PAD>``/``<UNK>``,
        * ``word2idx`` / ``idx2word`` map tokens to ids and back.

    Raises:
        ValueError: If ``raw_data`` does not contain at least one complete
            question/answer pair.
    """
    # Normalise the lines first: without stripping and dropping blanks, a
    # leading empty line shifts every question/answer pair by one and the
    # indentation ends up as whitespace tokens in the vocabulary.
    stripped_lines = (line.strip() for line in raw_data.splitlines())
    sentences = [line for line in stripped_lines if line]

    # Even positions are questions, odd positions are the matching answers;
    # zip() silently ignores a trailing unanswered question.
    input_texts = []
    target_texts = []
    for question, answer in zip(sentences[0::2], sentences[1::2]):
        input_texts.append(list(jieba.cut(question)))
        target_texts.append(list(jieba.cut(answer)))

    if not input_texts:
        raise ValueError(
            "raw_data must contain at least one question/answer pair"
        )

    # Build the vocabulary; ids 0 and 1 are reserved for padding and
    # out-of-vocabulary tokens.
    word2idx = {"<PAD>": 0, "<UNK>": 1}
    idx2word = {0: "<PAD>", 1: "<UNK>"}
    for input_text, target_text in zip(input_texts, target_texts):
        for word in input_text + target_text:
            if word not in word2idx:
                new_id = len(word2idx)
                word2idx[word] = new_id
                idx2word[new_id] = word
    num_words = len(word2idx)

    # Map every token to its id.
    input_sequences = [[word2idx[word] for word in text] for text in input_texts]
    target_sequences = [[word2idx[word] for word in text] for text in target_texts]

    # Questions and answers share one length so the model can emit one
    # output token per input position.
    max_length = max(len(seq) for seq in input_sequences + target_sequences)

    input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='post')
    target_sequences = pad_sequences(target_sequences, maxlen=max_length, padding='post')

    # One-hot encode the targets for use with categorical cross-entropy.
    target_sequences = to_categorical(target_sequences, num_words)

    return input_sequences, target_sequences, max_length, num_words, word2idx, idx2word

# 模型定义
def define_model(max_length, num_words):
    """Build and compile the Embedding -> GRU -> Dense dialogue network.

    Args:
        max_length: Length of the (padded) input sequences.
        num_words: Vocabulary size; used as the embedding input dimension
            and as the number of softmax output classes.

    Returns:
        The compiled ``Sequential`` model.
    """
    embedding_dim = 256
    hidden_units = 256

    layer_stack = [
        Embedding(num_words, embedding_dim, input_length=max_length),
        # return_sequences=True keeps one GRU output per time step so the
        # Dense layer predicts a word at every position.
        GRU(hidden_units, return_sequences=True),
        Dense(num_words, activation='softmax'),
    ]
    network = Sequential(layer_stack)

    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# 训练模型
def train_model(input_sequences, target_sequences, max_length, num_words):
    """Create a fresh model via ``define_model`` and fit it on the given data.

    Args:
        input_sequences: Padded question id sequences.
        target_sequences: One-hot encoded, padded answer sequences.
        max_length: Padded sequence length, forwarded to ``define_model``.
        num_words: Vocabulary size, forwarded to ``define_model``.

    Returns:
        The fitted model.
    """
    fit_options = {"epochs": 30, "batch_size": 64, "verbose": 2}

    network = define_model(max_length, num_words)
    network.fit(input_sequences, target_sequences, **fit_options)
    return network

# 测试模型
def test_model(model, word2idx, idx2word, max_length):
    """Run an interactive console loop that answers typed questions.

    Each question is tokenised with jieba, mapped to ids (unknown words
    become ``<UNK>``), padded to ``max_length`` and fed to ``model``.  The
    most probable word at every output position is looked up in
    ``idx2word``; ``<PAD>`` entries are removed and the remaining words are
    printed separated by spaces.

    The loop ends cleanly when the input stream is closed (EOF) or the user
    presses Ctrl-C at the prompt.

    Args:
        model: Trained model exposing ``predict``.
        word2idx: Token -> id mapping; must contain ``"<UNK>"``.
        idx2word: Id -> token mapping.
        max_length: Sequence length the model was trained with.
    """
    while True:
        try:
            input_text = input("请输入问题：")
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and leave the loop
            # instead of crashing with a traceback.
            print()
            break

        input_text = list(jieba.cut(input_text))
        input_sequence = [word2idx.get(word, word2idx["<UNK>"]) for word in input_text]
        input_sequence = pad_sequences([input_sequence], maxlen=max_length, padding='post')

        predicted_sequence = model.predict(input_sequence)
        # argmax over the last axis picks one word id per position; [0]
        # selects the single sample in the batch.
        predicted_sequence = np.argmax(predicted_sequence, axis=-1)[0]

        predicted_text = [idx2word[idx] for idx in predicted_sequence]
        predicted_text = [word for word in predicted_text if word != "<PAD>"]

        print("机器人回答：", ' '.join(predicted_text))

# 主程序
if __name__ == "__main__":
    # Small demo dialogue corpus, handed to preprocess_data as a single
    # newline-separated string.
    raw_data = """
    你好
    你好，我是人机对话系统，请问有什么可以帮到你的？
    你可以做什么？
    我可以回答一些关于人机对话系统的问题，或者与你进行简单的聊天。
    再见
    再见，有什么需要帮助的时候可以再来找我。
    """

    # Preprocess -> train -> start the interactive console session.
    prepared = preprocess_data(raw_data)
    input_sequences, target_sequences, max_length, num_words, word2idx, idx2word = prepared

    model = train_model(input_sequences, target_sequences, max_length, num_words)
    test_model(model, word2idx, idx2word, max_length)

上述代码首先使用jieba库对对话数据进行中文分词，并将其预处理为等长的数字序列；随后构建由嵌入层、GRU层和全连接层组成的Sequential模型，以categorical_crossentropy作为损失函数，用预处理后的数据进行训练；最后将训练好的模型用于测试：对用户输入的问题逐位置预测词语，生成机器人回答。