使用torchtext在PyTorch中进行中文文本分类任务
发布时间:2023-12-31 22:44:51
在PyTorch中进行中文文本分类任务时,可以使用torchtext库来处理数据的加载、预处理和切分。torchtext是一个用于自然语言处理的PyTorch的辅助库,提供了方便的数据处理和数据加载功能。
以下是一个使用torchtext进行中文文本分类任务的示例代码,以情感分类任务为例(注意:中文没有天然的空格词边界,实际使用时应采用字符级切分或jieba等中文分词工具,而不能直接套用英文按空格分词的方式):
import csv

import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# Character-level tokenizer. The original used get_tokenizer('basic_english'),
# which lowercases and splits on whitespace/punctuation — Chinese text has no
# whitespace word boundaries, so a whole sentence would become one giant token.
# Splitting into individual characters is the standard simple baseline for
# Chinese classification (a word segmenter such as jieba could be swapped in).
def tokenizer(text):
    """Tokenize *text* into a list of its individual characters."""
    return list(text)
# Custom dataset class for loading and preprocessing (text, label) pairs.
class ChineseTextClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing texts with integer labels.

    Based on ``torch.utils.data.Dataset``: the original base class
    ``torchtext.data.Dataset`` belongs to the legacy torchtext API and was
    removed in torchtext 0.12.

    Parameters
    ----------
    data : sequence of str or of token lists
        Raw texts, or texts that have already been tokenized.
    labels : sequence of int
        One class label per entry in ``data``.
    tokenizer : callable, optional
        Maps a raw string to a token list. Now optional, so callers holding
        pre-tokenized data need not supply one (the original made it required,
        and one call site omitted it).
    """

    def __init__(self, data, labels, tokenizer=None):
        if len(data) != len(labels):
            raise ValueError("data and labels must have the same length")
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer

    def __getitem__(self, i):
        sample = self.data[i]
        # Only tokenize raw strings. This guards against double tokenization:
        # the call sites in this file pass *already tokenized* lists, which the
        # original code re-fed to the tokenizer.
        if self.tokenizer is not None and isinstance(sample, str):
            sample = self.tokenizer(sample)
        return sample, self.labels[i]

    def __len__(self):
        return len(self.data)
# Path to the sentiment dataset: a CSV file with one "text,label" row per
# line, parsed by read_data() below.
data_path = 'data/ChineseSentimentAnalysis/dataset.csv'
# Read the dataset file into parallel lists of texts and labels.
def read_data(data_path):
    """Load a ``text,label`` CSV file.

    Parameters
    ----------
    data_path : str
        Path to a UTF-8 CSV file whose last column is an integer label and
        whose preceding column(s) form the text.

    Returns
    -------
    tuple[list[str], list[int]]
        ``(data, labels)`` in file order.

    The original implementation used ``line.split(',')`` and ``line[1]``,
    which truncates any text containing a comma, mis-reads its label, and
    crashes on blank lines. Using the csv module handles quoted fields, and
    taking the *last* column as the label tolerates unquoted commas in the
    text.
    """
    data = []
    labels = []
    # newline='' is the documented way to open files for the csv module.
    with open(data_path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if not row:
                continue  # tolerate blank lines
            # Label is the last column; everything before it is the text.
            data.append(','.join(row[:-1]))
            labels.append(int(row[-1]))
    return data, labels
# Load the corpus and tokenize every text once, up front.
data, labels = read_data(data_path)
tokenized_data = [tokenizer(text) for text in data]
# Build the vocabulary from the tokenized corpus.
vocab = build_vocab_from_iterator(tokenized_data, specials=["<unk>", "<pad>", "<bos>", "<eos>"])
# Fix: without a default index, the modern torchtext Vocab raises on any
# out-of-vocabulary token; map unknown tokens to <unk> instead.
vocab.set_default_index(vocab["<unk>"])
# Convert tokens to integer ids via the vocabulary.
def text_pipeline(text):
    """Map an iterable of tokens to a list of vocabulary indices.

    Fix: the Vocab object returned by ``build_vocab_from_iterator`` has no
    ``.stoi`` attribute (that was the legacy torchtext API); indexing is done
    with ``vocab[token]``.
    """
    return [vocab[token] for token in text]
# Datasets and data loaders.
batch_size = 32


def collate_batch(batch):
    """Collate (tokens, label) samples into padded (text, labels) tensors.

    Returns ``text`` of shape (batch, max_len); the train/eval loops transpose
    it to (max_len, batch), the sequence-first layout the LSTM expects.
    """
    token_lists, label_list = zip(*batch)
    pad_idx = vocab["<pad>"]
    encoded = [torch.tensor(vocab(list(tokens)), dtype=torch.int64) for tokens in token_lists]
    text = torch.nn.utils.rnn.pad_sequence(encoded, batch_first=True, padding_value=pad_idx)
    return text, torch.tensor(label_list, dtype=torch.int64)


# First 8000 samples train, the rest test.
train_dataset = ChineseTextClassificationDataset(tokenized_data[:8000], labels[:8000], tokenizer)
# Fix: the original omitted the (required) tokenizer argument here.
test_dataset = ChineseTextClassificationDataset(tokenized_data[8000:], labels[8000:], tokenizer)
# Fix: torchtext.data.BucketIterator is removed legacy API, and it yielded
# Batch objects — the `text, labels = batch` unpacking in train()/evaluate()
# requires plain tuples. A torch DataLoader with a padding collate_fn provides
# exactly that.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)
# Classification model (bidirectional-LSTM text classifier).
class TextClassifier(torch.nn.Module):
    """Embedding -> BiLSTM -> Linear over the final fwd/bwd hidden states."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        # Fix: forward() reads self.hidden_dim, but the original never stored
        # it, so every forward pass raised AttributeError.
        self.hidden_dim = hidden_dim
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        # Bidirectional output concatenates both directions -> 2 * hidden_dim.
        self.fc = torch.nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        """Classify a batch.

        ``text``: LongTensor of shape (seq_len, batch) — sequence-first, the
        LSTM default layout.  Returns logits of shape (batch, output_dim).
        """
        embedded = self.embedding(text)
        output, _ = self.rnn(embedded)
        # Final forward-direction state lives at the last time step
        # (output[-1, :, :H]); the final backward-direction state at the
        # first time step (output[0, :, H:]).
        last_hidden = torch.cat(
            (output[-1, :, :self.hidden_dim], output[0, :, self.hidden_dim:]), dim=1
        )
        return self.fc(last_hidden)
# Hyperparameters, model, optimizer and loss.
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 2  # two target classes

model = TextClassifier(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
)
# Adam with lr=0.001; CrossEntropyLoss expects raw (unsoftmaxed) logits.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()
# One training epoch.
def train(model, data_loader, optimizer, criterion):
    """Run a single epoch of optimization; return the mean batch loss."""
    model.train()
    running_loss = 0.0
    for text, labels in data_loader:
        # DataLoader yields batch-first text; the LSTM wants sequence-first.
        seq_first = text.transpose(0, 1)
        optimizer.zero_grad()
        logits = model(seq_first)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(data_loader)
# Evaluation pass (no gradient updates).
def evaluate(model, data_loader, criterion):
    """Return ``(mean batch loss, accuracy)`` over *data_loader*."""
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for text, labels in data_loader:
            logits = model(text.transpose(0, 1))
            loss_sum += criterion(logits, labels).item()
            predictions = logits.data.argmax(dim=1)
            n_seen += labels.size(0)
            n_correct += (predictions == labels).sum().item()
    return loss_sum / len(data_loader), n_correct / n_seen
# Main loop: one training pass plus one evaluation per epoch.
num_epochs = 10
for epoch in range(1, num_epochs + 1):
    train_loss = train(model, train_data_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_data_loader, criterion)
    print(f'Epoch: {epoch}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')
以上代码中,我们首先用get_tokenizer定义了分词器,并自定义了数据集类ChineseTextClassificationDataset来加载和预处理数据;接着用read_data函数读取数据集,用分词器对文本分词并据此构建词汇表;然后通过text_pipeline把分词结果映射为整数序列,并用BucketIterator定义数据加载器,它会按序列长度对样本排序分桶以便组成批次;之后定义了文本分类模型TextClassifier,利用Embedding层和双向LSTM层进行特征提取和分类;最后定义了训练函数train和评估函数evaluate来完成模型的训练与评估。训练过程中使用CrossEntropyLoss作为损失函数,并用Adam优化器更新模型参数。
这是一个简单的示例代码,你可以根据自己的需要进行修改和优化。希望可以帮助到你!
