Tokenizing Chinese data with pytorch_pretrained_bert.BertTokenizer
import torch
from pytorch_pretrained_bert import BertTokenizer
# Load the tokenizer of the pretrained Chinese BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# The Chinese sentence to tokenize
text = "我喜欢自然语言处理!"
# Tokenize the sentence with the tokenizer
tokens = tokenizer.tokenize(text)
print(tokens)
# Output: ['我', '喜', '欢', '自', '然', '语', '言', '处', '理', '!']
# (bert-base-chinese splits CJK text character by character, not into words)
# Convert the tokens into the vocabulary-ID sequence the BERT model expects
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)
# Output: one vocabulary ID per token, e.g. [2769, ...]; the exact IDs come from the bert-base-chinese vocab
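# (Optional check, not in the original snippet) convert_ids_to_tokens maps the
# IDs back to their tokens, which is handy for sanity-checking the round trip
print(tokenizer.convert_ids_to_tokens(input_ids))
# Should print the same token list as above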
# Add the special [CLS] and [SEP] tokens
# (this version of BertTokenizer has no cls_token_id/sep_token_id attributes,
#  so look the two IDs up with convert_tokens_to_ids instead)
cls_id, sep_id = tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]'])
input_ids = [cls_id] + input_ids + [sep_id]
print(input_ids)
# Output: the same IDs, now wrapped by 101 for [CLS] and 102 for [SEP]
# Convert the ID sequence into a PyTorch tensor
input_tensor = torch.tensor(input_ids)
print(input_tensor)
# Output: tensor([ 101, ..., 102]), one entry per token plus the two special tokens
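
# A minimal sketch, not part of the original snippet, of feeding the tensor into
# BertModel from the same library; it assumes the bert-base-chinese weights can
# be downloaded and that a single unpadded sentence is all we need here
from pytorch_pretrained_bert import BertModel

model = BertModel.from_pretrained('bert-base-chinese')
model.eval()
with torch.no_grad():
    # BertModel expects a batch dimension, so reshape to (1, seq_len);
    # for one unpadded sentence the attention mask is simply all ones
    batch = input_tensor.unsqueeze(0)
    attention_mask = torch.ones_like(batch)
    encoded_layers, pooled_output = model(batch, attention_mask=attention_mask)
# encoded_layers is a list with one (1, seq_len, 768) tensor per layer;
# pooled_output is the (1, 768) vector derived from the [CLS] position
print(len(encoded_layers), encoded_layers[-1].shape, pooled_output.shape)
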
# Character offsets (start and end position) of each token in the original sentence;
# bert-base-chinese emits one token per character, and this simple loop assumes
# every token maps directly onto the original text (no [UNK] tokens)
word_starts, word_ends = [], []
pos = 0
for tok in tokens:
    # strip the WordPiece '##' prefix, if any, before measuring the token length
    length = len(tok[2:]) if tok.startswith('##') else len(tok)
    word_starts.append(pos)
    word_ends.append(pos + length)
    pos += length
print(word_starts, word_ends)
# Output: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# Convert the offsets into PyTorch tensors
word_start_tensor = torch.tensor(word_starts)
word_end_tensor = torch.tensor(word_ends)
print(word_start_tensor, word_end_tensor)
# Output: tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
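# As a quick sanity check (not in the original snippet), the (start, end) offsets
# can be used to slice the original sentence back into its tokens
spans = [text[s:e] for s, e in zip(word_starts, word_ends)]
print(spans)
# Should print the same characters as the token list above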
