Applying TensorFlow's embedding_ops module to machine translation
machine_translation_helpers.py and machine_translation_train.py
"""
This file contains helper functions for building and training a machine translation model using TensorFlow.
"""
import tensorflow as tf
from tensorflow.python.ops import lookup_ops
def load_vocab(filename):
    """
    Load vocabulary file.
    Args:
        filename: Path to the vocabulary file (one token per line).
    Returns:
        A tuple (vocab_to_id, id_to_vocab), where vocab_to_id is a dictionary mapping word to id,
        and id_to_vocab is a list mapping id to word.
    """
    vocab = []
    with tf.gfile.GFile(filename, mode="r") as f:
        for line in f:
            vocab.append(line.strip())
    vocab_to_id = {word: i for i, word in enumerate(vocab)}
    id_to_vocab = vocab
    return vocab_to_id, id_to_vocab
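For a quick sanity check, the helper can be pointed at any one-token-per-line vocabulary file; the path below is only an illustrative placeholder.

vocab_to_id, id_to_vocab = load_vocab("data/vocab.en")
print(len(id_to_vocab), vocab_to_id.get("<unk>"))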
def create_vocab_tables(src_vocab_file, tgt_vocab_file):
    """
    Create lookup tables that map source and target words to ids.
    Args:
        src_vocab_file: Path to the source vocabulary file.
        tgt_vocab_file: Path to the target vocabulary file.
    Returns:
        A tuple (src_vocab_table, tgt_vocab_table) of lookup tables mapping source and target
        words to their integer ids.
    """
    # Out-of-vocabulary words fall back to id 0, so id 0 should be reserved for an
    # unknown-word token in both vocabulary files.
    src_vocab_table = lookup_ops.index_table_from_file(src_vocab_file, default_value=0)
    tgt_vocab_table = lookup_ops.index_table_from_file(tgt_vocab_file, default_value=0)
    return src_vocab_table, tgt_vocab_table
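A minimal sketch of how these tables are used; the file names and the example words are placeholders. The tables are graph resources, so tf.tables_initializer() has to run before the first lookup.

src_vocab_table, tgt_vocab_table = create_vocab_tables("data/vocab.en", "data/vocab.de")
ids = src_vocab_table.lookup(tf.constant(["the", "cat", "unknownword"]))
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(ids))  # e.g. [5 87 0]; unknown words fall back to id 0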
def convert_to_zero_padded_batch(src_inputs, tgt_inputs, src_vocab_table, tgt_vocab_table, batch_size):
    """
    Convert one batch of source/target sentences into zero-padded batches of word ids.
    Args:
        src_inputs: A 1-D string tensor (or list of strings) with batch_size source sentences.
        tgt_inputs: A 1-D string tensor (or list of strings) with batch_size target sentences.
        src_vocab_table: A lookup table mapping source words to ids.
        tgt_vocab_table: A lookup table mapping target words to ids.
        batch_size: Number of sentences in the batch.
    Returns:
        A tuple (src_ids, tgt_ids, src_lengths, tgt_lengths), where src_ids and tgt_ids are
        zero-padded [batch_size, max_len] int64 tensors of word ids, and src_lengths and
        tgt_lengths are [batch_size] int32 tensors of true sentence lengths.
    """
    def to_padded_ids(sentences, vocab_table):
        # Split on whitespace; the result is a SparseTensor whose dense shape is
        # [batch_size, max_sentence_length].
        tokens = tf.string_split(sentences)
        # True sentence lengths: count the non-empty tokens in each row.
        dense_tokens = tf.sparse_tensor_to_dense(tokens, default_value="")
        lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(dense_tokens, "")), axis=1)
        # Map tokens to ids and densify; shorter sentences are padded with id 0.
        ids = tf.SparseTensor(tokens.indices, vocab_table.lookup(tokens.values),
                              tokens.dense_shape)
        padded_ids = tf.sparse_tensor_to_dense(ids, default_value=0)
        padded_ids.set_shape([batch_size, None])
        return padded_ids, lengths

    # The lookup tables created by index_table_from_file register themselves with
    # tf.tables_initializer(), so no separate initializer collection is needed here.
    src_ids, src_lengths = to_padded_ids(src_inputs, src_vocab_table)
    tgt_ids, tgt_lengths = to_padded_ids(tgt_inputs, tgt_vocab_table)
    return src_ids, tgt_ids, src_lengths, tgt_lengths
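A rough usage sketch with made-up sentences; the tables come from create_vocab_tables above.

src_batch = tf.constant(["the cat sat", "hello world"])
tgt_batch = tf.constant(["die katze sass", "hallo welt"])
src_ids, tgt_ids, src_len, tgt_len = convert_to_zero_padded_batch(
    src_batch, tgt_batch, src_vocab_table, tgt_vocab_table, batch_size=2)
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run([src_ids, src_len]))  # src_ids has shape (2, 3); src_len is [3 2]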
def create_embedding(input_ids, vocab_size, embed_size):
    """
    Create an embedding matrix and look up embeddings for the input ids.
    Args:
        input_ids: A tensor containing word ids.
        vocab_size: Size of the vocabulary.
        embed_size: Size of each word embedding.
    Returns:
        A tensor of word embeddings.
    """
    # Call this inside distinct variable scopes (e.g. "encoder" / "decoder") so that the
    # source and target sides each get their own "embedding" variable.
    embedding = tf.get_variable("embedding", [vocab_size, embed_size], dtype=tf.float32)
    return tf.nn.embedding_lookup(embedding, input_ids)
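A small, self-contained illustration of what tf.nn.embedding_lookup does here; the scope name and sizes are arbitrary.

with tf.variable_scope("demo"):
    # Ids of shape [batch_size=2, time=3] are mapped to vectors of size embed_size=8,
    # giving a [2, 3, 8] float tensor.
    embedded = create_embedding(tf.constant([[4, 2, 0], [7, 1, 0]]), vocab_size=100, embed_size=8)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(embedded).shape)  # (2, 3, 8)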
def create_rnn_cell(hidden_size, num_layers, cell_type="LSTM"):
    """
    Create a (possibly multi-layer) RNN cell.
    Args:
        hidden_size: Size of the hidden state.
        num_layers: Number of RNN layers.
        cell_type: Type of RNN cell, possible values are "LSTM", "GRU", and "BasicRNN".
    Returns:
        An RNN cell.
    """
    def single_cell():
        if cell_type == "LSTM":
            return tf.nn.rnn_cell.LSTMCell(hidden_size)
        elif cell_type == "GRU":
            return tf.nn.rnn_cell.GRUCell(hidden_size)
        elif cell_type == "BasicRNN":
            return tf.nn.rnn_cell.BasicRNNCell(hidden_size)
        raise ValueError("Unsupported RNN cell type: %s" % cell_type)
    # Each layer needs its own cell instance; reusing one object ([cell] * num_layers)
    # would make every layer share the same variables and fail on shape mismatches.
    return tf.nn.rnn_cell.MultiRNNCell([single_cell() for _ in range(num_layers)])
def create_encoder(embedded_input, input_len, hidden_size, num_layers, cell_type="LSTM"):
    """
    Create an encoder RNN.
    Args:
        embedded_input: A [batch_size, max_len, embed_size] tensor of embedded source words.
        input_len: Lengths of the source sentences.
        hidden_size: Size of the hidden state.
        num_layers: Number of RNN layers.
        cell_type: Type of RNN cell, possible values are "LSTM", "GRU", and "BasicRNN".
    Returns:
        A tuple (outputs, state), where outputs is a [batch_size, max_len, hidden_size] tensor
        of encoder outputs and state is the final state of the encoder.
    """
    cell = create_rnn_cell(hidden_size, num_layers, cell_type)
    outputs, state = tf.nn.dynamic_rnn(cell, embedded_input, sequence_length=input_len,
                                       dtype=tf.float32)
    return outputs, state
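Putting the source side together might look roughly like this, reusing src_ids and src_len from the batching example; the vocabulary size and dimensions are placeholder assumptions.

with tf.variable_scope("encoder"):
    # vocab_size must match the number of lines in the source vocabulary file.
    embedded_src = create_embedding(src_ids, vocab_size=30000, embed_size=256)
    src_outputs, src_state = create_encoder(embedded_src, src_len, hidden_size=256, num_layers=2)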
def create_decoder_cell(hidden_size, num_layers, memory, memory_len, cell_type="LSTM"):
    """
    Create a decoder RNN cell with attention over the encoder outputs.
    Args:
        hidden_size: Size of the hidden state.
        num_layers: Number of RNN layers.
        memory: Encoder outputs to attend over, shape [batch_size, src_len, hidden_size].
        memory_len: Lengths of the source sentences.
        cell_type: Type of RNN cell, possible values are "LSTM", "GRU", and "BasicRNN".
    Returns:
        An RNN cell wrapped with attention.
    """
    cell = create_rnn_cell(hidden_size, num_layers, cell_type)
    # AttentionWrapper lives in tf.contrib.seq2seq; the attention mechanism needs the
    # encoder outputs (memory) and the source lengths to mask padded positions.
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        hidden_size, memory, memory_sequence_length=memory_len)
    return tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism)
def create_decoder(embedded_tgt_input, tgt_input_len, src_outputs, src_input_len, src_state,
                   hidden_size, num_layers, cell_type="LSTM"):
    """
    Create a decoder RNN with attention.
    Args:
        embedded_tgt_input: A tensor of embedded target words.
        tgt_input_len: Lengths of the target sentences.
        src_outputs: Encoder outputs, used as the attention memory.
        src_input_len: Lengths of the source sentences.
        src_state: Final state of the encoder.
        hidden_size: Size of the hidden state.
        num_layers: Number of RNN layers.
        cell_type: Type of RNN cell, possible values are "LSTM", "GRU", and "BasicRNN".
    Returns:
        A tuple (outputs, state), where outputs is a tensor of decoder outputs and state is the
        final state of the decoder.
    """
    cell = create_decoder_cell(hidden_size, num_layers, src_outputs, src_input_len, cell_type)
    batch_size = tf.shape(embedded_tgt_input)[0]
    # The AttentionWrapper has its own state structure, so the encoder state is injected
    # through zero_state(...).clone(cell_state=...) rather than passed directly.
    initial_state = cell.zero_state(batch_size, tf.float32).clone(cell_state=src_state)
    outputs, state = tf.nn.dynamic_rnn(cell, embedded_tgt_input, sequence_length=tgt_input_len,
                                       initial_state=initial_state)
    return outputs, state
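A corresponding sketch of the target side, reusing the encoder tensors from the previous example and the same assumed dimensions.

with tf.variable_scope("decoder"):
    embedded_tgt = create_embedding(tgt_ids, vocab_size=30000, embed_size=256)
    dec_outputs, dec_state = create_decoder(embedded_tgt, tgt_len, src_outputs, src_len,
                                            src_state, hidden_size=256, num_layers=2)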
def create_loss(logits, targets, target_len):
    """
    Create the cross-entropy loss.
    Args:
        logits: [batch_size, max_len, vocab_size] logits computed from the decoder outputs.
        targets: [batch_size, max_len] target word ids.
        target_len: Lengths of the target sentences.
    Returns:
        The loss value, averaged over the non-padding target tokens.
    """
    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, logits=logits)
    # Mask out padding positions so they do not contribute to the loss.
    target_weights = tf.sequence_mask(target_len, maxlen=tf.shape(targets)[1], dtype=tf.float32)
    loss = tf.reduce_sum(crossent * target_weights) / tf.to_float(tf.reduce_sum(target_len))
    return loss
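The decoder outputs still have to be projected to vocabulary-sized logits before the loss can be computed; a minimal sketch using tf.layers.dense, continuing the decoder example above.

# In a real model the labels would be the decoder inputs shifted by one position
# (sentence followed by an end-of-sentence token); tgt_ids is reused here only to
# keep the sketch short.
logits = tf.layers.dense(dec_outputs, 30000, use_bias=False, name="output_projection")
loss = create_loss(logits, tgt_ids, tgt_len)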
def create_optimizer(loss, learning_rate, num_steps, decay_steps=None, decay_rate=None):
    """
    Create the optimization op.
    Args:
        loss: Loss tensor.
        learning_rate: Initial learning rate.
        num_steps: Total number of training steps (currently unused; kept for the caller's API).
        decay_steps: Number of steps between learning rate decays.
        decay_rate: Learning rate decay rate.
    Returns:
        An op that applies one Adam update and increments the global step.
    """
    global_step = tf.Variable(0, trainable=False, name="global_step")
    if decay_steps and decay_rate:
        learning_rate = tf.train.exponential_decay(learning_rate, global_step, decay_steps,
                                                   decay_rate, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.minimize(loss, global_step=global_step)
def create_train_op(loss, learning_rate, num_steps, decay_steps=None, decay_rate=None):
    """
    Create a training operation.
    Args:
        loss: Loss tensor.
        learning_rate: Learning rate.
        num_steps: Number of training steps.
        decay_steps: Number of steps for learning rate decay.
        decay_rate: Learning rate decay rate.
    Returns:
        The training operation.
    """
    train_op = create_optimizer(loss, learning_rate, num_steps, decay_steps, decay_rate)
    # Expose the op through a collection so it can be recovered after restoring the graph,
    # but return the op that was just created rather than the first entry in the collection.
    tf.add_to_collection("train_op", train_op)
    return train_op
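A minimal training-loop sketch under the assumptions above; the hyperparameters are placeholders, and a real script would feed fresh batches from an input pipeline rather than reuse the constant example batch.

train_op = create_train_op(loss, learning_rate=0.001, num_steps=10000,
                           decay_steps=1000, decay_rate=0.5)
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    for step in range(10000):
        _, loss_value = sess.run([train_op, loss])
        if step % 100 == 0:
            print("step %d, loss %.4f" % (step, loss_value))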
def create_prediction
