# A simple transformer model using the existing torch.nn.Transformer implementation
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
class SimpleTransformer(nn.Module):
    def __init__(self, input_dim, model_dim, nhead, num_encoder_layers, num_decoder_layers):
        super(SimpleTransformer, self).__init__()
        # specify the embedding vector size, which is also the transformer's input vector size
        self.model_dim = model_dim
        # the embedding module for converting a token/word to an embedding vector
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,  # size of the dictionary of embeddings, i.e. the vocabulary size
            embedding_dim=model_dim    # the size of each embedding vector
        )
        # stack up a transformer module with the required number of layers
        self.transformer = nn.Transformer(
            d_model=model_dim,                      # the number of expected features in the encoder/decoder inputs, default=512
            nhead=nhead,                            # the number of heads in the multi-head attention, default=8
            num_encoder_layers=num_encoder_layers,  # the number of sub-encoder-layers in the encoder, default=6
            num_decoder_layers=num_decoder_layers,  # the number of sub-decoder-layers in the decoder, default=6
            dim_feedforward=128,                    # the dimension of the feedforward network model, default=2048
            dropout=0.1,                            # the dropout value, default=0.1
            activation='relu',                      # 'gelu' / 'relu' / a callable, default='relu'
            batch_first=True                        # inputs/outputs are (batch, seq, feature) rather than (seq, batch, feature)
        )
        self.fc_out = nn.Linear(model_dim, input_dim)

    def forward(self, src, tgt):
        # Scaling the embeddings by the square root of the model dimension is a practice borrowed from
        # the original Transformer paper (Attention Is All You Need); it keeps the embeddings on a scale
        # comparable to the attention mechanism's other inputs, which helps stabilize training.
        src = self.embedding(src) * np.sqrt(self.model_dim)
        tgt = self.embedding(tgt) * np.sqrt(self.model_dim)
        # Generate the look-ahead mask for the target sequence: tokens after the current position must be
        # masked so the decoder cannot peek at future tokens. The source sequence needs no look-ahead mask;
        # if source sequences of different lengths were padded, a padding mask (src_key_padding_mask)
        # would be passed instead.
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        # pass the source and target sequences through the transformer, with the target mask
        output = self.transformer(src, tgt, tgt_mask=tgt_mask)
        output = self.fc_out(output)
        return output
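# A quick standalone illustration (demo only, not part of the model) of the look-ahead mask used in
# forward(): allowed positions are 0.0 and future positions are -inf, so attention gives them zero weight.
demo_mask = nn.Transformer.generate_square_subsequent_mask(4)
print("Look-ahead mask for a length-4 sequence:\n", demo_mask)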
# Parameters
input_dim = 10 # Size of vocabulary
model_dim = 32 # Embedding dimension
nhead = 4 # Number of attention heads
num_encoder_layers = 2
num_decoder_layers = 2
seq_len = 5 # Length of the sequence
# Initialize the model, optimizer, and loss function
model = SimpleTransformer(input_dim, model_dim, nhead, num_encoder_layers, num_decoder_layers)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
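# Note: CrossEntropyLoss applies log-softmax internally, which is why fc_out returns raw logits
# rather than probabilities.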
# Generate random sequences for demo
def generate_random_sequence(batch_size, seq_len, input_dim):
return torch.randint(0, input_dim, (batch_size, seq_len))
# use random sequences with batch size = 3 for the demo
batch_size = 3
src = generate_random_sequence(batch_size, seq_len, input_dim)
tgt = generate_random_sequence(batch_size, seq_len, input_dim)
# Forward pass through the model
output = model(src, tgt)
# Calculate loss (for demonstration only, the target sequence itself is used as the labels;
# in a real training setup the labels would be the target shifted one position to the left)
loss = criterion(output.reshape(-1, input_dim), tgt.reshape(-1))
print(f'Loss: {loss.item()}')
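# A minimal sketch of a single optimization step using the optimizer defined above; the demo itself
# only runs one forward pass, while a real run would repeat this inside a loop over batches/epochs.
optimizer.zero_grad()  # clear any previously accumulated gradients
loss.backward()        # backpropagate the demo loss through the transformer
optimizer.step()       # update the parameters with Adam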
# Print the output
print("Output shape:", output.shape)
print("Output:", output)