# A simple transformer model using the existing torch.nn.Transformer implementation
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
class SimpleTransformer(nn.Module):
    def __init__(self, input_dim, model_dim, nhead, num_encoder_layers, num_decoder_layers):
        super(SimpleTransformer, self).__init__()
        # specify the embedding vector size, which is also the transformer's input vector size
        self.model_dim = model_dim
        # the embedding module for converting a token/word to an embedding vector
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,  # size of the dictionary of embeddings, i.e. the vocabulary size
            embedding_dim=model_dim    # the size of each embedding vector
        )
        # stack up a transformer module with the required number of layers
        self.transformer = nn.Transformer(
            d_model=model_dim,                      # the number of expected features in the encoder/decoder inputs, default=512
            nhead=nhead,                            # the number of heads in the multi-head attention, default=8
            num_encoder_layers=num_encoder_layers,  # the number of sub-encoder-layers in the encoder, default=6
            num_decoder_layers=num_decoder_layers,  # the number of sub-decoder-layers in the decoder, default=6
            dim_feedforward=128,                    # the dimension of the feedforward network model, default=2048
            dropout=0.1,                            # the dropout value, default=0.1
            activation='relu',                      # 'gelu' / 'relu' / a callable, default='relu'
            batch_first=True                        # inputs/outputs are (batch, seq, feature) rather than (seq, batch, feature)
        )
        self.fc_out = nn.Linear(model_dim, input_dim)

    def forward(self, src, tgt):
        # Scaling the embeddings by the square root of the model dimension is a practice borrowed from
        # the original Transformer paper (Attention Is All You Need); it keeps the embeddings on a scale
        # comparable to the attention mechanism's other inputs, which helps stabilize training.
        src = self.embedding(src) * np.sqrt(self.model_dim)
        tgt = self.embedding(tgt) * np.sqrt(self.model_dim)
        # Generate the look-ahead mask for the target sequence: tokens after the current position must be
        # masked so the decoder cannot peek at future tokens. The source sequence needs no look-ahead mask;
        # if source sequences of different lengths were padded, a padding mask (src_key_padding_mask)
        # would be passed instead.
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        # pass the source and target sequences through the transformer, with the target mask
        output = self.transformer(src, tgt, tgt_mask=tgt_mask)
        output = self.fc_out(output)
        return output
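# A quick standalone illustration (demo only, not part of the model) of the look-ahead mask used in
# forward(): allowed positions are 0.0 and future positions are -inf, so attention gives them zero weight.
demo_mask = nn.Transformer.generate_square_subsequent_mask(4)
print("Look-ahead mask for a length-4 sequence:\n", demo_mask)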
# Parameters
input_dim = 10 # Size of vocabulary
model_dim = 32 # Embedding dimension
nhead = 4 # Number of attention heads
num_encoder_layers = 2
num_decoder_layers = 2
seq_len = 5 # Length of the sequence
# Initialize the model, optimizer, and loss function
model = SimpleTransformer(input_dim, model_dim, nhead, num_encoder_layers, num_decoder_layers)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
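# Note: CrossEntropyLoss applies log-softmax internally, which is why fc_out returns raw logits
# rather than probabilities.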
# Generate random sequences for demo
def generate_random_sequence(batch_size, seq_len, input_dim):
return torch.randint(0, input_dim, (batch_size, seq_len))
# use random sequences with batch size = 3 for the demo
batch_size = 3
src = generate_random_sequence(batch_size, seq_len, input_dim)
tgt = generate_random_sequence(batch_size, seq_len, input_dim)
# Forward pass through the model
output = model(src, tgt)
# Calculate loss (for demonstration only, the target sequence itself is used as the labels;
# in a real training setup the labels would be the target shifted one position to the left)
loss = criterion(output.reshape(-1, input_dim), tgt.reshape(-1))
print(f'Loss: {loss.item()}')
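# A minimal sketch of a single optimization step using the optimizer defined above; the demo itself
# only runs one forward pass, while a real run would repeat this inside a loop over batches/epochs.
optimizer.zero_grad()  # clear any previously accumulated gradients
loss.backward()        # backpropagate the demo loss through the transformer
optimizer.step()       # update the parameters with Adam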
# Print the output
print("Output shape:", output.shape)
print("Output:", output)