使用Python实现GRU网络进行中文新闻摘要生成
发布时间:2023-12-12 07:55:15
以下是一个用Python实现GRU网络进行中文新闻摘要生成的示例代码,所采用的模型是基于循环神经网络(RNN)的GRU(门控循环单元)编码器-解码器。该代码使用训练数据集训练模型并生成摘要,并通过验证集与测试集上的损失对模型效果进行评估(如需BLEU等生成质量指标,可在此基础上自行接入)。代码使用了PyTorch深度学习框架。
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import TranslationDataset
from torchtext.data import Field, BucketIterator
# 定义摘要生成器类
class _TokenGRU(nn.Module):
    """Embed token indices, then run them through a single-layer GRU.

    Bug fix: the original model fed raw LongTensor token indices straight
    into nn.GRU, which requires float feature vectors of size input_dim —
    an embedding layer is needed in between. Wrapping embedding + GRU in
    one module also lets callers (e.g. greedy decoding) invoke
    ``model.encoder(indices)`` / ``model.decoder(indices, hidden)``
    directly with index tensors, as the rest of this script does.
    """

    def __init__(self, vocab_size, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.gru = nn.GRU(hidden_dim, hidden_dim)

    def forward(self, tokens, hidden=None):
        """tokens: LongTensor (seq_len, batch). Returns (output, hidden)."""
        embedded = self.embedding(tokens)
        if hidden is None:
            return self.gru(embedded)
        return self.gru(embedded, hidden)


class Seq2Seq(nn.Module):
    """GRU encoder-decoder for sequence-to-sequence summary generation.

    Args:
        input_dim: source-vocabulary size.
        hidden_dim: embedding and GRU hidden size (shared by both sides).
        output_dim: target-vocabulary size.

    forward(src, trg) takes LongTensors of shape (src_len, batch) and
    (trg_len, batch) and returns logits of shape (trg_len, batch, output_dim).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.encoder = _TokenGRU(input_dim, hidden_dim)
        self.decoder = _TokenGRU(output_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, trg):
        # The encoder's final hidden state initializes the decoder
        # (teacher forcing: the full target sequence is fed at once).
        _, hidden = self.encoder(src)
        output, _ = self.decoder(trg, hidden)
        return self.fc(output)
# 定义训练和评估函数
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss."""
    model.train()
    total_loss = 0.0
    for batch in iterator:
        optimizer.zero_grad()
        logits = model(batch.src, batch.trg)
        # The prediction at step t is scored against the target token at
        # t+1, so drop the final prediction and the leading <sos> token.
        vocab_size = logits.shape[-1]
        flat_logits = logits[:-1].reshape(-1, vocab_size)
        flat_targets = batch.trg[1:].reshape(-1)
        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(iterator)
def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss over *iterator*, without gradient updates."""
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.src, batch.trg)
            # Same shifted alignment as in train(): prediction t vs target t+1.
            vocab_size = logits.shape[-1]
            loss = criterion(
                logits[:-1].reshape(-1, vocab_size),
                batch.trg[1:].reshape(-1),
            )
            total_loss += loss.item()
    return total_loss / len(iterator)
# Seed the RNG and force deterministic cuDNN kernels for reproducibility.
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True

# torchtext fields for source articles (SRC) and target summaries (TRG).
# NOTE(review): this is the legacy torchtext API (Field/BucketIterator/
# TranslationDataset), removed in torchtext >= 0.12 — confirm the pinned
# torchtext version.
# NOTE(review): tokenize='jieba' is not one of the tokenizer names
# torchtext's get_tokenizer recognizes; a callable (e.g. jieba.lcut) is
# likely required for Chinese segmentation — confirm.
SRC = Field(tokenize='jieba', lower=True, init_token='<sos>', eos_token='<eos>')
TRG = Field(tokenize='jieba', lower=True, init_token='<sos>', eos_token='<eos>')

# Load the parallel corpus: each split pairs an article file ('.src' ext)
# with a summary file ('.trg' ext) under 'data_path'.
train_data, valid_data, test_data = TranslationDataset.splits(
    path='data_path',
    train='train.txt',
    validation='valid.txt',
    test='test.txt',
    exts=('.src', '.trg'),
    fields=(SRC, TRG)
)
# Vocabularies are built from the training split only; tokens occurring
# fewer than twice map to <unk>.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# Batch iterators that bucket similar-length examples to reduce padding.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
)

# Model, optimizer and loss; padding positions are excluded from the loss.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HIDDEN_DIM = 256
model = Seq2Seq(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=TRG.vocab.stoi[TRG.pad_token])

# Training loop: checkpoint the weights with the best validation loss.
N_EPOCHS = 10
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model, valid_iterator, criterion)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}')

# Evaluate the best checkpoint on the held-out test split.
model.load_state_dict(torch.load('best_model.pt'))
test_loss = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}')
# 使用模型生成摘要
def generate_summary(model, src, max_len=50):
    """Greedy-decode a summary (space-joined tokens) for raw source text *src*.

    Uses the module-level SRC/TRG fields and `device`. Decoding feeds the
    previously predicted token back into the decoder until <eos> is
    produced or `max_len` steps have run.

    NOTE(review): passes raw index tensors into model.encoder/model.decoder;
    this requires the model's encoder/decoder to embed indices internally —
    confirm against the model definition.
    """
    model.eval()
    tokens = ['<sos>'] + SRC.tokenize(src) + ['<eos>']
    src_indexes = [SRC.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
    trg_indexes = [TRG.vocab.stoi['<sos>']]
    with torch.no_grad():
        # Bug fix: nn.GRU-style modules return (output, hidden); the original
        # bound the whole tuple to `hidden` and then passed it to the decoder.
        _, hidden = model.encoder(src_tensor)
        for _ in range(max_len):
            # Bug fix: the decoder expects (seq_len, batch); the original
            # passed a 1-D tensor with no batch dimension.
            trg_tensor = torch.LongTensor([trg_indexes[-1]]).unsqueeze(1).to(device)
            output, hidden = model.decoder(trg_tensor, hidden)
            # Bug fix: project the decoder state to vocabulary logits before
            # argmax; the original argmax'd over the hidden dimension, so the
            # "token" it produced was a hidden-unit index, not a vocab id.
            prediction = model.fc(output)
            pred_token = prediction.argmax(2)[-1, 0].item()
            trg_indexes.append(pred_token)
            if pred_token == TRG.vocab.stoi['<eos>']:
                break
    trg_tokens = [TRG.vocab.itos[i] for i in trg_indexes]
    # Bug fix: only strip a trailing <eos> when one was actually generated;
    # the original's [1:-1] dropped a real token when max_len was reached.
    if trg_tokens and trg_tokens[-1] == '<eos>':
        trg_tokens = trg_tokens[:-1]
    return ' '.join(trg_tokens[1:])
# Example: summarize one raw news sentence with the trained model.
sample_text = '这是一条新闻的内容。'
print(generate_summary(model, sample_text))
请注意,以上代码仅是一个示例,需要自行准备并预处理训练、验证和测试数据集,并将数据路径和文件名与代码中标注的部分进行相应修改。在实际的应用中,可以根据需要自定义模型架构、优化器、损失函数以及模型评估的指标等内容进行调整和优化。希望能对你的工作有所帮助!
