使用Python实现基于GRU网络的中文人机对话系统
发布时间:2023-12-12 07:58:23
下面是一个使用Python实现基于GRU网络的中文人机对话系统的示例,包含数据预处理、模型定义、训练和测试等步骤。
import numpy as np
import jieba
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# Data preprocessing
def preprocess_data(raw_data):
    """Convert a raw dialogue transcript into padded training arrays.

    ``raw_data`` is read as alternating lines: a question followed by its
    answer. Lines are stripped and blank lines are ignored, so the leading
    and trailing newlines of a triple-quoted literal cannot shift the
    question/answer pairing. A trailing question with no answer is dropped.

    Args:
        raw_data: Transcript text, one utterance per line.

    Returns:
        A tuple ``(input_sequences, target_sequences, max_length, num_words,
        word2idx, idx2word)``:

        * ``input_sequences`` - post-padded token ids, one row per question,
          ``max_length`` columns.
        * ``target_sequences`` - the post-padded answer ids, one-hot encoded
          over ``num_words`` classes.
        * ``max_length`` - length of the longest tokenized question or answer.
        * ``num_words`` - vocabulary size including ``<PAD>`` and ``<UNK>``.
        * ``word2idx`` / ``idx2word`` - the two vocabulary mappings.

    Raises:
        ValueError: If ``raw_data`` holds no complete question/answer pair.
    """
    # Drop blank lines first; otherwise an empty first line would be treated
    # as a question and every real question would be paired with the wrong
    # answer.
    utterances = [line.strip() for line in raw_data.splitlines() if line.strip()]
    pairs = list(zip(utterances[0::2], utterances[1::2]))
    if not pairs:
        raise ValueError(
            "raw_data must contain at least one question/answer pair"
        )

    # Tokenize both sides of every pair with jieba.
    input_texts = [list(jieba.cut(question)) for question, _ in pairs]
    target_texts = [list(jieba.cut(answer)) for _, answer in pairs]

    # Build the vocabulary; ids 0 and 1 are reserved for padding / unknown.
    word2idx = {"<PAD>": 0, "<UNK>": 1}
    idx2word = {0: "<PAD>", 1: "<UNK>"}
    for question_tokens, answer_tokens in zip(input_texts, target_texts):
        for word in question_tokens + answer_tokens:
            if word not in word2idx:
                new_id = len(word2idx)
                word2idx[word] = new_id
                idx2word[new_id] = word
    num_words = len(word2idx)

    # Map tokens to ids.
    input_sequences = [[word2idx[word] for word in text] for text in input_texts]
    target_sequences = [[word2idx[word] for word in text] for text in target_texts]

    # Pad both sides to one common length so the model's per-timestep output
    # lines up with the target tensor.
    max_length = max(len(seq) for seq in input_sequences + target_sequences)
    input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='post')
    target_sequences = pad_sequences(target_sequences, maxlen=max_length, padding='post')

    # One-hot encode the targets for categorical cross-entropy.
    target_sequences = to_categorical(target_sequences, num_words)
    return input_sequences, target_sequences, max_length, num_words, word2idx, idx2word
# Model definition
def define_model(max_length, num_words):
    """Build and compile the Embedding -> GRU -> Dense(softmax) network.

    Args:
        max_length: Sequence length passed to the embedding layer as
            ``input_length``.
        num_words: Vocabulary size; used as the embedding input dimension
            and as the number of softmax output classes.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Embedding(num_words, 256, input_length=max_length),
        # return_sequences=True keeps one output vector per timestep.
        GRU(256, return_sequences=True),
        Dense(num_words, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network
# Model training
def train_model(input_sequences, target_sequences, max_length, num_words):
    """Create a model via ``define_model`` and fit it on the given data.

    Args:
        input_sequences: Training inputs forwarded to ``fit``.
        target_sequences: Training targets forwarded to ``fit``.
        max_length: Forwarded to ``define_model``.
        num_words: Forwarded to ``define_model``.

    Returns:
        The fitted model.
    """
    fit_options = {"epochs": 30, "batch_size": 64, "verbose": 2}
    network = define_model(max_length, num_words)
    network.fit(input_sequences, target_sequences, **fit_options)
    return network
# Interactive testing
def test_model(model, word2idx, idx2word, max_length):
    """Chat with the trained model on the console until input ends.

    Each question is tokenized with jieba, mapped to ids (unknown words map
    to ``<UNK>``), padded to ``max_length`` and fed to ``model.predict``. The
    highest-scoring word at every timestep is printed, with ``<PAD>`` removed.

    The loop ends cleanly on end-of-input (Ctrl-D / closed stdin) or Ctrl-C
    instead of terminating with a traceback. Blank input is skipped rather
    than sent to the model as an all-padding sequence.

    Args:
        model: Trained model exposing ``predict``.
        word2idx: Mapping from word to id; must contain ``"<UNK>"``.
        idx2word: Mapping from id to word.
        max_length: Sequence length the model was trained with.
    """
    unk_id = word2idx["<UNK>"]
    while True:
        try:
            question = input("请输入问题:")
        except (EOFError, KeyboardInterrupt):
            # Finish the prompt line so the shell prompt starts cleanly.
            print()
            break
        if not question.strip():
            continue

        token_ids = [word2idx.get(word, unk_id) for word in jieba.cut(question)]
        # NOTE: pad_sequences also truncates inputs longer than max_length.
        batch = pad_sequences([token_ids], maxlen=max_length, padding='post')
        # verbose=0 keeps Keras' progress bar out of the conversation.
        scores = model.predict(batch, verbose=0)
        predicted_ids = np.argmax(scores, axis=-1)[0]
        predicted_words = [idx2word[int(idx)] for idx in predicted_ids]
        reply_words = [word for word in predicted_words if word != "<PAD>"]
        print("机器人回答:", ' '.join(reply_words))
# Script entry point
if __name__ == "__main__":
    # Small inline demo corpus handed to preprocess_data.
    raw_data = """
你好
你好,我是人机对话系统,请问有什么可以帮到你的?
你可以做什么?
我可以回答一些关于人机对话系统的问题,或者与你进行简单的聊天。
再见
再见,有什么需要帮助的时候可以再来找我。
"""
    # Pipeline: preprocess -> train -> interactive console test.
    (input_sequences, target_sequences, max_length,
     num_words, word2idx, idx2word) = preprocess_data(raw_data)
    model = train_model(input_sequences, target_sequences, max_length, num_words)
    test_model(model, word2idx, idx2word, max_length)
上述代码先使用jieba库对对话数据进行中文分词,并将其预处理为数字序列;随后构建由嵌入层、GRU层和全连接层组成的Sequential模型,以categorical_crossentropy作为损失函数,用预处理后的数据进行训练;最后进入交互式测试,对用户输入的问题生成机器人回答。
