A Worked Example of Chinese Sentiment Analysis with a Pretrained BERT Model in PyTorch
Published: 2024-01-02 18:09:11
This article shows how to use a pretrained BERT model in PyTorch for Chinese sentiment analysis. Sentiment analysis is a text-mining technique that automatically identifies and extracts the sentiment polarity of a text, such as positive, negative, or neutral. BERT (Bidirectional Encoder Representations from Transformers) is a pretrained language model built on the Transformer architecture, with strong text-representation ability.
First, install the required packages (note that the PyTorch package on PyPI is named torch, not pytorch; scikit-learn and pandas are also used below). Run the following command in a terminal:
pip install torch transformers scikit-learn pandas
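If you want to confirm the installation before going further, a quick sketch is simply to import the two main packages and print their versions:
import torch
import transformers
print(torch.__version__, transformers.__version__)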
Next, prepare the dataset. Here we use the THUCNews data, which contains a large number of news texts together with sentiment labels; the files can be downloaded from the THUCTC project's GitHub page and unpacked. Each line of the dataset contains one piece of text and its label: '0' for negative, '1' for neutral, and '2' for positive.
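To make the expected input concrete, here is a minimal sketch of what the CSV used below might look like. The column names 'text' and 'label' are assumptions that must match your actual file, and the sample rows are invented purely to illustrate the format:
text,label
这部电影太糟糕了，完全浪费时间,0
今天的新闻发布会如期举行,1
服务非常好，下次还会再来,2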
Now we can start writing the code. First, import the required libraries:
import torch
from torch.optim import AdamW  # AdamW now lives in torch.optim; the transformers version is deprecated
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
Next, define a custom dataset class to load and process the data:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels          # pass None for unlabeled data (e.g. at inference time)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        text = str(self.texts[index])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        item = {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[index], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.texts)
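As a quick sanity check of what encode_plus produces, the following sketch tokenizes a single sentence on its own (the sample sentence is made up, and the tokenizer line simply mirrors the initialization done below):
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
sample = tokenizer.encode_plus(
    '服务非常好',
    add_special_tokens=True,
    max_length=16,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)
print(sample['input_ids'].shape)       # torch.Size([1, 16])
print(sample['attention_mask'].shape)  # torch.Size([1, 16])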
Then, load and split the dataset:
df = pd.read_csv('path/to/dataset.csv')  # load the dataset (assumed to have 'text' and 'label' columns)
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)  # split into training and validation sets
# Initialize the BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Initialize the datasets (texts and labels are passed separately)
train_dataset = SentimentDataset(df_train['text'].to_numpy(), df_train['label'].to_numpy(), tokenizer, max_len=128)
val_dataset = SentimentDataset(df_val['text'].to_numpy(), df_val['label'].to_numpy(), tokenizer, max_len=128)
# Initialize the data loaders (the validation set does not need to be shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
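Before moving on, it can help to pull one batch from the loader and confirm the tensor shapes; this is purely an optional sanity check and assumes the training set has at least one full batch of 16 rows:
batch = next(iter(train_loader))
print(batch['input_ids'].shape)       # torch.Size([16, 128])
print(batch['attention_mask'].shape)  # torch.Size([16, 128])
print(batch['labels'].shape)          # torch.Size([16])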
Next, define the model architecture and initialize it:
class SentimentClassifier(torch.nn.Module):
    def __init__(self):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.dropout = torch.nn.Dropout(0.3)
        self.fc = torch.nn.Linear(768, 3)  # 3 classes: negative, neutral, positive

    def forward(self, input_ids, attention_mask):
        # Recent versions of transformers return a ModelOutput object rather than a tuple,
        # so take the pooled [CLS] representation explicitly.
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = self.dropout(outputs.pooler_output)
        output = self.fc(output)
        return output
# Initialize the model
model = SentimentClassifier()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
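As a minimal sketch of a sanity check, assuming the train_loader defined above, you can run one batch through the not-yet-fine-tuned model and confirm the logits have shape [batch_size, 3]:
model.eval()
with torch.no_grad():
    batch = next(iter(train_loader))
    logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
print(logits.shape)  # torch.Size([16, 3])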
Then, define the training and validation loop:
def train(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs):
    best_accuracy = 0.0
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            model.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            train_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
            optimizer.step()
            scheduler.step()
        # Validation phase
        model.eval()
        val_loss = 0.0
        val_accuracy = 0.0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predictions = torch.max(outputs, dim=1)
                val_accuracy += accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        avg_val_accuracy = val_accuracy / len(val_loader)
        # Keep the best checkpoint by validation accuracy
        if avg_val_accuracy > best_accuracy:
            best_accuracy = avg_val_accuracy
            torch.save(model.state_dict(), 'path/to/model.pt')
        print(f'Epoch {epoch+1}')
        print(f'Training loss: {avg_train_loss:.4f}')
        print(f'Validation loss: {avg_val_loss:.4f}')
        print(f'Validation accuracy: {avg_val_accuracy:.4f}')
# Define the loss function, optimizer, and learning-rate scheduler
# (num_epochs must be set before the scheduler, since the total number of training steps depends on it)
num_epochs = 10
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs)
# Train the model
train(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs)
Finally, load the trained model and use it for prediction:
model.load_state_dict(torch.load('path/to/model.pt'))
model.eval()
# df_test is assumed to be a DataFrame with a 'text' column, loaded the same way as the training data
df_test = pd.read_csv('path/to/test.csv')
test_dataset = SentimentDataset(df_test['text'].to_numpy(), None, tokenizer, max_len=128)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask)
        _, batch_predictions = torch.max(outputs, dim=1)
        predictions.extend(batch_predictions.cpu().numpy())
df_test['predictions'] = predictions
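To classify a single new sentence, the same tokenizer and model can be reused directly. The helper below is a minimal sketch: the label-to-name mapping follows the 0/negative, 1/neutral, 2/positive convention described above, and the example sentence is made up:
label_names = {0: 'negative', 1: 'neutral', 2: 'positive'}

def predict_sentiment(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    with torch.no_grad():
        logits = model(encoding['input_ids'].to(device), encoding['attention_mask'].to(device))
    return label_names[int(torch.argmax(logits, dim=1).item())]

print(predict_sentiment('服务非常好，下次还会再来'))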
That completes the end-to-end example of Chinese sentiment analysis with a pretrained BERT model in PyTorch. By building a custom dataset class, defining the model, training and validating it, and then using the trained model for prediction, we can automatically analyze and classify the sentiment of Chinese text.
