Sequence to Sequence Learning with Neural Networks (12/12/2014)
Famous introductory paper on ''sequence-to-sequence'' models for [[traduzione automatica|machine translation]] tasks; one of the authors is [[Ilya Sutskever]].
== How it works, in brief ==
See https://colab.research.google.com/github/bentrevett/pytorch-seq2seq/blob/main/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb#scrollTo=3zCnTFgqQZ5x
The '''encoder''':
* Takes, one token at a time, a vector in one-hot format that represents the token's ID in the source vocabulary
* This passes through an nn.Embedding layer, which lets the network map tokens from the sparse one-hot representation to a dense, semantic embedding representation (see the small sketch after this list)
* It then passes through the LSTM layers (four in the paper), which return a hidden state, capturing the most recent changes, and a cell state, capturing long-term dependencies
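A minimal sketch of the embedding step (not from the paper or the notebook; the vocabulary size and dimensions are arbitrary toy values): looking up a token ID in nn.Embedding gives the same vector as multiplying its one-hot representation by the embedding weight matrix.

import torch
import torch.nn as nn

# Toy sizes, chosen only for illustration.
vocab_size, embedding_dim = 10, 4
embedding = nn.Embedding(vocab_size, embedding_dim)

token_id = torch.tensor([3])                                    # one token ID from the source vocabulary
dense = embedding(token_id)                                     # dense lookup: [1, embedding_dim]

one_hot = nn.functional.one_hot(token_id, vocab_size).float()   # sparse view: [1, vocab_size]
via_matmul = one_hot @ embedding.weight                         # same vector via matrix multiplication

print(torch.allclose(dense, via_matmul))                        # True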
import random

import torch
import torch.nn as nn


class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src length, batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src length, batch size, embedding dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs = [src length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # outputs are always from the top hidden layer
        return hidden, cell
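As a quick shape check (toy hyperparameters chosen here only for illustration; the paper itself uses four layers and much larger dimensions), the encoder can be run on a dummy batch of token IDs:

input_dim, embedding_dim, hidden_dim, n_layers, dropout = 100, 32, 64, 2, 0.5
encoder = Encoder(input_dim, embedding_dim, hidden_dim, n_layers, dropout)

src = torch.randint(0, input_dim, (7, 3))   # [src length = 7, batch size = 3] of token IDs
hidden, cell = encoder(src)
print(hidden.shape, cell.shape)             # torch.Size([2, 3, 64]) torch.Size([2, 3, 64])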
The '''decoder''':
* Receives as input either the <sos> (start of sentence) token or the last generated token
* Receives the hidden and cell states and feeds them into the LSTM layer(s). At the first iteration these are the encoder's final states.
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # n directions in the decoder will always be 1, therefore:
        # hidden = [n layers, batch size, hidden dim]
        # cell = [n layers, batch size, hidden dim]
        input = input.unsqueeze(0)
        # input = [1, batch size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch size, embedding dim]
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # output = [seq length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # seq length and n directions will always be 1 in this decoder, therefore:
        # output = [1, batch size, hidden dim]
        # hidden = [n layers, batch size, hidden dim]
        # cell = [n layers, batch size, hidden dim]
        prediction = self.fc_out(output.squeeze(0))
        # prediction = [batch size, output dim]
        return prediction, hidden, cell
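Continuing the toy example above (the <sos> index used here is an assumption for illustration, not taken from the notebook), a single decoding step takes one token per batch element plus the encoder's states:

output_dim = 120                                          # toy target-vocabulary size
decoder = Decoder(output_dim, embedding_dim, hidden_dim, n_layers, dropout)

sos_idx = 0                                               # assumed <sos> index, purely illustrative
input = torch.full((3,), sos_idx, dtype=torch.long)       # [batch size = 3]
prediction, hidden, cell = decoder(input, hidden, cell)   # states come from the encoder above
print(prediction.shape)                                   # torch.Size([3, 120])
next_token = prediction.argmax(1)                         # greedy choice for the next input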
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        # input = [batch size]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden and previous cell states
            # receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            # output = [batch size, output dim]
            # hidden = [n layers, batch size, hidden dim]
            # cell = [n layers, batch size, hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs
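An end-to-end sketch (dummy data and toy sizes, not the notebook's actual training setup) of wiring the three modules together and running one forward pass:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

enc = Encoder(input_dim=100, embedding_dim=32, hidden_dim=64, n_layers=2, dropout=0.5)
dec = Decoder(output_dim=120, embedding_dim=32, hidden_dim=64, n_layers=2, dropout=0.5)
model = Seq2Seq(enc, dec, device).to(device)

src = torch.randint(0, 100, (7, 3)).to(device)    # [src length, batch size]
trg = torch.randint(0, 120, (9, 3)).to(device)    # [trg length, batch size]

outputs = model(src, trg, teacher_forcing_ratio=0.75)
print(outputs.shape)                              # torch.Size([9, 3, 120])

# Position 0 of outputs stays all zeros (the <sos> slot); for training, positions 1..trg_length-1
# would typically be compared with trg[1:] via cross-entropy, while at inference time one would
# feed back the argmax token and set teacher_forcing_ratio = 0.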
[[File:Seq2seqLearning.png|none|thumb|600x600px]]
=== Links ===
https://arxiv.org/abs/1409.3215
[https://colab.research.google.com/github/bentrevett/pytorch-seq2seq/blob/main/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb PyTorch implementation for English-German translation]