Applying TensorFlow's embedding_ops module to machine translation
machine_translation_helpers.py and machine_translation_train.py
"""
This file contains helper functions for building and training a machine translation model using TensorFlow.
"""
import tensorflow as tf
from tensorflow.python.ops import lookup_ops
def load_vocab(filename):
    """
    Load vocabulary file.
    Args:
        filename: Path to the vocabulary file (one token per line).
    Returns:
        A tuple (vocab_to_id, id_to_vocab), where vocab_to_id is a dictionary mapping word to id,
        and id_to_vocab is a list mapping id to word.
    """
    vocab = []
    with tf.gfile.GFile(filename, mode="r") as f:
        for line in f:
            vocab.append(line.strip())
    vocab_to_id = {word: i for i, word in enumerate(vocab)}
    id_to_vocab = vocab
    return vocab_to_id, id_to_vocab
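For a quick sanity check, the helper can be pointed at any one-token-per-line vocabulary file; the path below is only an illustrative placeholder.

vocab_to_id, id_to_vocab = load_vocab("data/vocab.en")
print(len(id_to_vocab), vocab_to_id.get("<unk>"))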
def create_vocab_tables(src_vocab_file, tgt_vocab_file):
    """
    Create lookup tables that map source and target words to ids.
    Args:
        src_vocab_file: Path to the source vocabulary file.
        tgt_vocab_file: Path to the target vocabulary file.
    Returns:
        A tuple (src_vocab_table, tgt_vocab_table) of lookup tables mapping source and target
        words to their integer ids.
    """
    # Out-of-vocabulary words fall back to id 0, so id 0 should be reserved for an
    # unknown-word token in both vocabulary files.
    src_vocab_table = lookup_ops.index_table_from_file(src_vocab_file, default_value=0)
    tgt_vocab_table = lookup_ops.index_table_from_file(tgt_vocab_file, default_value=0)
    return src_vocab_table, tgt_vocab_table
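A minimal sketch of how these tables are used; the file names and the example words are placeholders. The tables are graph resources, so tf.tables_initializer() has to run before the first lookup.

src_vocab_table, tgt_vocab_table = create_vocab_tables("data/vocab.en", "data/vocab.de")
ids = src_vocab_table.lookup(tf.constant(["the", "cat", "unknownword"]))
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(ids))  # e.g. [5 87 0]; unknown words fall back to id 0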
def convert_to_zero_padded_batch(src_inputs, tgt_inputs, src_vocab_table, tgt_vocab_table, batch_size):
    """
    Convert one batch of source/target sentences into zero-padded batches of word ids.
    Args:
        src_inputs: A 1-D string tensor (or list of strings) with batch_size source sentences.
        tgt_inputs: A 1-D string tensor (or list of strings) with batch_size target sentences.
        src_vocab_table: A lookup table mapping source words to ids.
        tgt_vocab_table: A lookup table mapping target words to ids.
        batch_size: Number of sentences in the batch.
    Returns:
        A tuple (src_ids, tgt_ids, src_lengths, tgt_lengths), where src_ids and tgt_ids are
        zero-padded [batch_size, max_len] int64 tensors of word ids, and src_lengths and
        tgt_lengths are [batch_size] int32 tensors of true sentence lengths.
    """
    def to_padded_ids(sentences, vocab_table):
        # Split on whitespace; the result is a SparseTensor whose dense shape is
        # [batch_size, max_sentence_length].
        tokens = tf.string_split(sentences)
        # True sentence lengths: count the non-empty tokens in each row.
        dense_tokens = tf.sparse_tensor_to_dense(tokens, default_value="")
        lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(dense_tokens, "")), axis=1)
        # Map tokens to ids and densify; shorter sentences are padded with id 0.
        ids = tf.SparseTensor(tokens.indices, vocab_table.lookup(tokens.values),
                              tokens.dense_shape)
        padded_ids = tf.sparse_tensor_to_dense(ids, default_value=0)
        padded_ids.set_shape([batch_size, None])
        return padded_ids, lengths

    # The lookup tables created by index_table_from_file register themselves with
    # tf.tables_initializer(), so no separate initializer collection is needed here.
    src_ids, src_lengths = to_padded_ids(src_inputs, src_vocab_table)
    tgt_ids, tgt_lengths = to_padded_ids(tgt_inputs, tgt_vocab_table)
    return src_ids, tgt_ids, src_lengths, tgt_lengths
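A rough usage sketch with made-up sentences; the tables come from create_vocab_tables above.

src_batch = tf.constant(["the cat sat", "hello world"])
tgt_batch = tf.constant(["die katze sass", "hallo welt"])
src_ids, tgt_ids, src_len, tgt_len = convert_to_zero_padded_batch(
    src_batch, tgt_batch, src_vocab_table, tgt_vocab_table, batch_size=2)
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run([src_ids, src_len]))  # src_ids has shape (2, 3); src_len is [3 2]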
def create_embedding(input_ids, vocab_size, embed_size):
    """
    Create an embedding matrix and look up embeddings for the input ids.
    Args:
        input_ids: A tensor containing word ids.
        vocab_size: Size of the vocabulary.
        embed_size: Size of each word embedding.
    Returns:
        A tensor of word embeddings.
    """
    # Call this inside distinct variable scopes (e.g. "encoder" / "decoder") so that the
    # source and target sides each get their own "embedding" variable.
    embedding = tf.get_variable("embedding", [vocab_size, embed_size], dtype=tf.float32)
    return tf.nn.embedding_lookup(embedding, input_ids)
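A small, self-contained illustration of what tf.nn.embedding_lookup does here; the scope name and sizes are arbitrary.

with tf.variable_scope("demo"):
    # Ids of shape [batch_size=2, time=3] are mapped to vectors of size embed_size=8,
    # giving a [2, 3, 8] float tensor.
    embedded = create_embedding(tf.constant([[4, 2, 0], [7, 1, 0]]), vocab_size=100, embed_size=8)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(embedded).shape)  # (2, 3, 8)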
def create_rnn_cell(hidden_size, num_layers, cell_type="LSTM"):
    """
    Create a (possibly multi-layer) RNN cell.
    Args:
        hidden_size: Size of the hidden state.
        num_layers: Number of RNN layers.
        cell_type: Type of RNN cell, possible values are "LSTM", "GRU", and "BasicRNN".
    Returns:
        An RNN cell.
    """
    def single_cell():
        if cell_type == "LSTM":
            return tf.nn.rnn_cell.LSTMCell(hidden_size)
        elif cell_type == "GRU":
            return tf.nn.rnn_cell.GRUCell(hidden_size)
        elif cell_type == "BasicRNN":
            return tf.nn.rnn_cell.BasicRNNCell(hidden_size)
        raise ValueError("Unsupported RNN cell type: %s" % cell_type)
    # Each layer needs its own cell instance; reusing one object ([cell] * num_layers)
    # would make every layer share the same variables and fail on shape mismatches.
    return tf.nn.rnn_cell.MultiRNNCell([single_cell() for _ in range(num_layers)])
def create_encoder(embedded_input, input_len, hidden_size, num_layers, cell_type="LSTM"):
    """
    Create an encoder RNN.
    Args:
        embedded_input: A [batch_size, max_len, embed_size] tensor of embedded source words.
        input_len: Lengths of the source sentences.
        hidden_size: Size of the hidden state.
        num_layers: Number of RNN layers.
        cell_type: Type of RNN cell, possible values are "LSTM", "GRU", and "BasicRNN".
    Returns:
        A tuple (outputs, state), where outputs is a [batch_size, max_len, hidden_size] tensor
        of encoder outputs and state is the final state of the encoder.
    """
    cell = create_rnn_cell(hidden_size, num_layers, cell_type)
    outputs, state = tf.nn.dynamic_rnn(cell, embedded_input, sequence_length=input_len,
                                       dtype=tf.float32)
    return outputs, state
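Putting the source side together might look roughly like this, reusing src_ids and src_len from the batching example; the vocabulary size and dimensions are placeholder assumptions.

with tf.variable_scope("encoder"):
    # vocab_size must match the number of lines in the source vocabulary file.
    embedded_src = create_embedding(src_ids, vocab_size=30000, embed_size=256)
    src_outputs, src_state = create_encoder(embedded_src, src_len, hidden_size=256, num_layers=2)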
def create_decoder_cell(hidden_size, num_layers, memory, memory_len, cell_type="LSTM"):
    """
    Create a decoder RNN cell with attention over the encoder outputs.
    Args:
        hidden_size: Size of the hidden state.
        num_layers: Number of RNN layers.
        memory: Encoder outputs to attend over, shape [batch_size, src_len, hidden_size].
        memory_len: Lengths of the source sentences.
        cell_type: Type of RNN cell, possible values are "LSTM", "GRU", and "BasicRNN".
    Returns:
        An RNN cell wrapped with attention.
    """
    cell = create_rnn_cell(hidden_size, num_layers, cell_type)
    # AttentionWrapper lives in tf.contrib.seq2seq; the attention mechanism needs the
    # encoder outputs (memory) and the source lengths to mask padded positions.
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        hidden_size, memory, memory_sequence_length=memory_len)
    return tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism)
def create_decoder(embedded_tgt_input, tgt_input_len, src_outputs, src_input_len, src_state,
                   hidden_size, num_layers, cell_type="LSTM"):
    """
    Create a decoder RNN with attention.
    Args:
        embedded_tgt_input: A tensor of embedded target words.
        tgt_input_len: Lengths of the target sentences.
        src_outputs: Encoder outputs, used as the attention memory.
        src_input_len: Lengths of the source sentences.
        src_state: Final state of the encoder.
        hidden_size: Size of the hidden state.
        num_layers: Number of RNN layers.
        cell_type: Type of RNN cell, possible values are "LSTM", "GRU", and "BasicRNN".
    Returns:
        A tuple (outputs, state), where outputs is a tensor of decoder outputs and state is the
        final state of the decoder.
    """
    cell = create_decoder_cell(hidden_size, num_layers, src_outputs, src_input_len, cell_type)
    batch_size = tf.shape(embedded_tgt_input)[0]
    # The AttentionWrapper has its own state structure, so the encoder state is injected
    # through zero_state(...).clone(cell_state=...) rather than passed directly.
    initial_state = cell.zero_state(batch_size, tf.float32).clone(cell_state=src_state)
    outputs, state = tf.nn.dynamic_rnn(cell, embedded_tgt_input, sequence_length=tgt_input_len,
                                       initial_state=initial_state)
    return outputs, state
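A corresponding sketch of the target side, reusing the encoder tensors from the previous example and the same assumed dimensions.

with tf.variable_scope("decoder"):
    embedded_tgt = create_embedding(tgt_ids, vocab_size=30000, embed_size=256)
    dec_outputs, dec_state = create_decoder(embedded_tgt, tgt_len, src_outputs, src_len,
                                            src_state, hidden_size=256, num_layers=2)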
def create_loss(logits, targets, target_len):
    """
    Create the cross-entropy loss.
    Args:
        logits: [batch_size, max_len, vocab_size] logits computed from the decoder outputs.
        targets: [batch_size, max_len] target word ids.
        target_len: Lengths of the target sentences.
    Returns:
        The loss value, averaged over the non-padding target tokens.
    """
    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, logits=logits)
    # Mask out padding positions so they do not contribute to the loss.
    target_weights = tf.sequence_mask(target_len, maxlen=tf.shape(targets)[1], dtype=tf.float32)
    loss = tf.reduce_sum(crossent * target_weights) / tf.to_float(tf.reduce_sum(target_len))
    return loss
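The decoder outputs still have to be projected to vocabulary-sized logits before the loss can be computed; a minimal sketch using tf.layers.dense, continuing the decoder example above.

# In a real model the labels would be the decoder inputs shifted by one position
# (sentence followed by an end-of-sentence token); tgt_ids is reused here only to
# keep the sketch short.
logits = tf.layers.dense(dec_outputs, 30000, use_bias=False, name="output_projection")
loss = create_loss(logits, tgt_ids, tgt_len)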
def create_optimizer(loss, learning_rate, num_steps, decay_steps=None, decay_rate=None):
    """
    Create the optimization op.
    Args:
        loss: Loss tensor.
        learning_rate: Initial learning rate.
        num_steps: Total number of training steps (currently unused; kept for the caller's API).
        decay_steps: Number of steps between learning rate decays.
        decay_rate: Learning rate decay rate.
    Returns:
        An op that applies one Adam update and increments the global step.
    """
    global_step = tf.Variable(0, trainable=False, name="global_step")
    if decay_steps and decay_rate:
        learning_rate = tf.train.exponential_decay(learning_rate, global_step, decay_steps,
                                                   decay_rate, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.minimize(loss, global_step=global_step)
def create_train_op(loss, learning_rate, num_steps, decay_steps=None, decay_rate=None):
    """
    Create a training operation.
    Args:
        loss: Loss tensor.
        learning_rate: Learning rate.
        num_steps: Number of training steps.
        decay_steps: Number of steps for learning rate decay.
        decay_rate: Learning rate decay rate.
    Returns:
        The training operation.
    """
    train_op = create_optimizer(loss, learning_rate, num_steps, decay_steps, decay_rate)
    # Expose the op through a collection so it can be recovered after restoring the graph,
    # but return the op that was just created rather than the first entry in the collection.
    tf.add_to_collection("train_op", train_op)
    return train_op
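A minimal training-loop sketch under the assumptions above; the hyperparameters are placeholders, and a real script would feed fresh batches from an input pipeline rather than reuse the constant example batch.

train_op = create_train_op(loss, learning_rate=0.001, num_steps=10000,
                           decay_steps=1000, decay_rate=0.5)
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    for step in range(10000):
        _, loss_value = sess.run([train_op, loss])
        if step % 100 == 0:
            print("step %d, loss %.4f" % (step, loss_value))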
def create_prediction
