Performance Comparison of SingleIdTokenIndexer and BERT Models for Chinese Text Classification
SingleIdTokenIndexer and BERT-based models are commonly used tools in natural language processing for Chinese text classification. The sections below compare their performance and give a usage example for each.
1. SingleIdTokenIndexer
SingleIdTokenIndexer is a TokenIndexer in AllenNLP that converts text into integer indices: it maps each token to a unique integer ID, turning the input text into a sequence of integers. Used on its own, it tends to perform poorly on Chinese text classification, because it captures neither the semantics of words nor their context.
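Before the full training example below, here is a minimal sketch of what SingleIdTokenIndexer actually produces (assuming the AllenNLP 2.x API and a toy, hand-built vocabulary used only for illustration):

from allennlp.data import Vocabulary
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexer = SingleIdTokenIndexer(namespace="tokens")
vocab = Vocabulary()
# Toy vocabulary; in practice it is built from the training data.
vocab.add_tokens_to_namespace(["这", "是", "好", "消", "息"], namespace="tokens")
tokens = [Token(c) for c in "这是好消息"]
print(indexer.tokens_to_indices(tokens, vocab))
# e.g. {'tokens': [2, 3, 4, 5, 6]} -- one integer ID per character; the exact IDs depend on
# the vocabulary, and no semantic or contextual information is involved.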
Full usage example:
# Uses the AllenNLP 2.x API.
from typing import Dict
from allennlp.data import Instance, TextFieldTensors, Vocabulary
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import CharacterTokenizer
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training import GradientDescentTrainer
from allennlp.training.metrics import CategoricalAccuracy
import torch
import torch.optim as optim

class SimpleClassifier(Model):
    def __init__(self, vocab: Vocabulary, embedder: BasicTextFieldEmbedder):
        super().__init__(vocab)
        self.embedder = embedder
        # Average the token embeddings into a single sentence vector.
        self.encoder = BagOfEmbeddingsEncoder(embedder.get_output_dim(), averaged=True)
        self.linear = torch.nn.Linear(self.encoder.get_output_dim(), vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

    def forward(self, tokens: TextFieldTensors, label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        embedded = self.embedder(tokens)
        # Mask is True for real tokens and False for padding positions.
        mask = get_text_field_mask(tokens)
        encoded = self.encoder(embedded, mask)
        logits = self.linear(encoded)
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            output["loss"] = torch.nn.functional.cross_entropy(logits, label)
        return output

token_indexers = {'tokens': SingleIdTokenIndexer()}
tokenizer = CharacterTokenizer()  # character-level tokenization is a simple choice for Chinese
train_texts = ['这是一个好消息', '这是一个坏消息']
train_labels = ['positive', 'negative']
train_instances = []
for text, label in zip(train_texts, train_labels):
    tokens = tokenizer.tokenize(text)
    fields = {"tokens": TextField(tokens, token_indexers), "label": LabelField(label)}
    train_instances.append(Instance(fields))
# Build the token and label vocabularies from the training instances.
vocab = Vocabulary.from_instances(train_instances)
data_loader = SimpleDataLoader(train_instances, batch_size=2)
data_loader.index_with(vocab)
embedder = BasicTextFieldEmbedder({"tokens": Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                                       embedding_dim=20)})
model = SimpleClassifier(vocab, embedder)
optimizer = optim.SGD(model.parameters(), lr=0.1)
trainer = GradientDescentTrainer(model=model,
                                 optimizer=optimizer,
                                 data_loader=data_loader,
                                 num_epochs=10)
trainer.train()
The example above uses a simple bag-of-embeddings model with a SingleIdTokenIndexer for Chinese text classification. The model embeds every token in the input, averages the embeddings with the BagOfEmbeddingsEncoder to obtain a sentence representation, and passes that representation through a linear layer to produce the class logits; training uses a cross-entropy loss, and CategoricalAccuracy tracks the model's accuracy.
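To see the trained model in action, a minimal inference sketch might look like this (it assumes the objects defined in the example above; the test sentence is made up for illustration):

from allennlp.data import Batch

model.eval()
test_tokens = tokenizer.tokenize('这是一个非常好的消息')  # hypothetical test sentence
test_instance = Instance({"tokens": TextField(test_tokens, token_indexers)})
batch = Batch([test_instance])
batch.index_instances(vocab)
with torch.no_grad():
    logits = model(**batch.as_tensor_dict())["logits"]
pred = logits.argmax(dim=-1).item()
print(vocab.get_token_from_index(pred, namespace='labels'))  # 'positive' or 'negative'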
2. The BERT model
BERT (Bidirectional Encoder Representations from Transformers) is a pretrained bidirectional Transformer encoder that has achieved strong results on many NLP tasks. Through bidirectional language-model pretraining, it learns effective contextual distributed representations of words and sentences.
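To see how BERT handles Chinese input, here is a small sketch (assuming AllenNLP's PretrainedTransformerTokenizer, which wraps the Hugging Face tokenizer for "bert-base-chinese"):

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

bert_tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-chinese")
print([t.text for t in bert_tokenizer.tokenize('这是一个好消息')])
# Roughly: ['[CLS]', '这', '是', '一', '个', '好', '消', '息', '[SEP]']
# Chinese BERT's WordPiece vocabulary is essentially character-level; the IDs of these
# tokens are what PretrainedTransformerIndexer later feeds into the model.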
Usage example:
# Uses the AllenNLP 2.x API.
from typing import Dict
from allennlp.data import Instance, TextFieldTensors, Vocabulary
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder
from allennlp.nn.util import get_text_field_mask
from allennlp.training import GradientDescentTrainer
from allennlp.training.metrics import CategoricalAccuracy
import torch
import torch.optim as optim

class Classifier(Model):
    def __init__(self, vocab: Vocabulary, embedder: TextFieldEmbedder, hidden_dim: int, num_classes: int):
        super().__init__(vocab)
        self.embedder = embedder
        # LSTM over the BERT output; its final hidden state is used as the sentence vector.
        self.encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(embedder.get_output_dim(), hidden_dim, batch_first=True))
        self.linear = torch.nn.Linear(hidden_dim, num_classes)
        self.accuracy = CategoricalAccuracy()

    def forward(self, tokens: TextFieldTensors, label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        embedded = self.embedder(tokens)
        # BERT inputs are padded like any other batch, so a padding mask is still needed.
        mask = get_text_field_mask(tokens)
        encoded = self.encoder(embedded, mask)
        logits = self.linear(encoded)
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            output["loss"] = torch.nn.functional.cross_entropy(logits, label)
        return output

tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-chinese")
token_indexers = {"tokens": PretrainedTransformerIndexer(model_name="bert-base-chinese")}
train_texts = ['这是一个好消息', '这是一个坏消息']
train_labels = ['positive', 'negative']
train_instances = []
for text, label in zip(train_texts, train_labels):
    tokens = tokenizer.tokenize(text)
    fields = {"tokens": TextField(tokens, token_indexers), "label": LabelField(label)}
    train_instances.append(Instance(fields))
# The token vocabulary comes from the pretrained BERT tokenizer; only labels are collected here.
vocab = Vocabulary.from_instances(train_instances)
data_loader = SimpleDataLoader(train_instances, batch_size=2)
data_loader.index_with(vocab)
embedder = BasicTextFieldEmbedder(
    {"tokens": PretrainedTransformerEmbedder(model_name="bert-base-chinese")})
model = Classifier(vocab, embedder, hidden_dim=128, num_classes=vocab.get_vocab_size('labels'))
optimizer = optim.Adam(model.parameters(), lr=2e-5)  # small learning rate for fine-tuning BERT
trainer = GradientDescentTrainer(model=model,
                                 optimizer=optimizer,
                                 data_loader=data_loader,
                                 num_epochs=10)
trainer.train()
The example above uses the BERT model with a PretrainedTransformerIndexer for Chinese text classification. PretrainedTransformerIndexer maps each token to the ID it has in the BERT vocabulary, turning the input text into a sequence of integers. The model uses the pretrained Chinese BERT ("bert-base-chinese") as its embedding layer, encodes the output with an LSTM, and classifies from the resulting sentence vector. CategoricalAccuracy is again used to track the model's accuracy.
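A common alternative design (not part of the example above) is to skip the LSTM and classify directly from BERT's pooled [CLS] representation, for example with AllenNLP's BertPooler:

from allennlp.modules.seq2vec_encoders import BertPooler

# Possible drop-in replacement for the LSTM encoder inside Classifier:
cls_pooler = BertPooler("bert-base-chinese")
# cls_pooler(embedded, mask) returns one vector per sentence (the pooled [CLS] state),
# so the linear layer would take cls_pooler.get_output_dim() as its input size instead of hidden_dim.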
On Chinese text classification tasks, a BERT-based model usually outperforms a model built on SingleIdTokenIndexer alone, because BERT captures word semantics and context and can therefore model Chinese text much better.
