Tokenizing a Chinese title with pytorch_pretrained_bert's BertTokenizer.from_pretrained()
import torch
from pytorch_pretrained_bert import BertTokenizer
# Load the tokenizer that ships with the pretrained Chinese BERT model (bert-base-chinese)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Define a Chinese title to tokenize
chinese_title = "自然语言处理的实践应用"
# Tokenize the Chinese title (bert-base-chinese splits Chinese text into individual characters)
tokenized_title = tokenizer.tokenize(chinese_title)
# Convert the tokens to the vocabulary ids (input_ids) that BERT expects
input_ids = tokenizer.convert_tokens_to_ids(tokenized_title)
# Print the tokens and their corresponding input_ids
print("Tokens:", tokenized_title)
print("input_ids:", input_ids)
# Example output (the exact ids come from the bert-base-chinese vocabulary):
# Tokens: ['自', '然', '语', '言', '处', '理', '的', '实', '践', '应', '用']
# input_ids: [1744, 1304, 6382, 7307, 2157, 7770, 4638, 1394, 4263, 2828, 5632]
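# Optional sanity check, assuming the standard pytorch_pretrained_bert tokenizer API:
# convert_ids_to_tokens() maps ids back to tokens, so the round trip should
# reproduce the tokens printed above.
recovered_tokens = tokenizer.convert_ids_to_tokens(input_ids)
assert recovered_tokens == tokenized_title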
# Wrap the input_ids in a batch dimension and convert them to a PyTorch tensor
input_tensor = torch.tensor([input_ids])
# Print the input_tensor
print("input_tensor:", input_tensor)
# input_tensor: tensor([[1744, 1304, 6382, 7307, 2157, 7770, 4638, 1394, 4263, 2828, 5632]])
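
Once the tensor is built, it can be passed to the encoder itself. The sketch below is one possible continuation, assuming the pytorch_pretrained_bert BertModel API, whose forward pass returns a list of per-layer encodings plus a pooled output; note that BERT also expects the sequence to be wrapped in the [CLS] and [SEP] special tokens, which the snippet above omits.

from pytorch_pretrained_bert import BertModel
# Wrap the tokens in BERT's special tokens before encoding
tokens_with_specials = ['[CLS]'] + tokenized_title + ['[SEP]']
ids_with_specials = tokenizer.convert_tokens_to_ids(tokens_with_specials)
tensor_with_specials = torch.tensor([ids_with_specials])
# Load the pretrained Chinese BERT encoder and switch off dropout for inference
model = BertModel.from_pretrained('bert-base-chinese')
model.eval()
with torch.no_grad():
    # encoded_layers: one tensor per Transformer layer,
    # each of shape [batch_size, sequence_length, hidden_size]
    encoded_layers, pooled_output = model(tensor_with_specials)
print("last layer shape:", encoded_layers[-1].shape)
# last layer shape: torch.Size([1, 13, 768])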
