A GPT-style transformer uses only the decoder part of the Transformer architecture. The target (output) sequence is simply the input sequence shifted by one position, and self-attention over the sequence must be masked so that no position can attend to future tokens.
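To make the shift-by-one targets and the look-ahead mask concrete, here is a small standalone illustration (independent of the model below) using a toy sequence of four token ids:
import torch
import torch.nn as nn

tokens = torch.tensor([5, 2, 7, 1])              # toy sequence of token ids
inp, target = tokens[:-1], tokens[1:]            # input: [5, 2, 7], target: [2, 7, 1]
mask = nn.Transformer.generate_square_subsequent_mask(len(inp))
print(mask)                                      # -inf above the diagonal blocks attention to future positions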
# A simple decoder-only Transformer model using PyTorch's built-in nn.TransformerDecoder
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
class GPTDecoder(nn.Module):
    def __init__(self, input_dim, model_dim, nhead, num_layers):
        super().__init__()
        self.model_dim = model_dim
        # Embedding layer
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,  # Size of the vocabulary
            embedding_dim=model_dim    # Dimension of each embedding vector
        )
        # Transformer decoder layer (batch_first=True so inputs are (batch, seq, dim))
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=model_dim,
            nhead=nhead,
            dim_feedforward=128,
            dropout=0.1,
            activation='relu',
            batch_first=True
        )
        self.transformer_decoder = nn.TransformerDecoder(
            decoder_layer,
            num_layers=num_layers
        )
        # Linear layer to project the decoder output to vocabulary size
        self.fc_out = nn.Linear(model_dim, input_dim)

    # def create_lookahead_mask(self, size):
    #     """
    #     Creates a look-ahead mask for the target sequence.
    #     This is just for showing how it is done.
    #     Use nn.Transformer.generate_square_subsequent_mask() for a one-liner.
    #     """
    #     mask = torch.triu(torch.ones(size, size), diagonal=1).float()
    #     mask = mask.masked_fill(mask == 1, float('-inf'))
    #     mask = mask.masked_fill(mask == 0, float(0.0))
    #     return mask

    def forward(self, tgt):
        # Embedding and scaling
        tgt = self.embedding(tgt) * np.sqrt(self.model_dim)
        # Create a look-ahead mask over the sequence length (dim 1, since batch_first=True)
        # tgt_mask = self.create_lookahead_mask(tgt.size(1)).to(tgt.device)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        # Pass through the transformer decoder; the sequence serves as its own memory,
        # so the causal mask is applied to both self-attention and cross-attention
        output = self.transformer_decoder(tgt, tgt, tgt_mask=tgt_mask, memory_mask=tgt_mask)
        output = self.fc_out(output)
        return output
# Parameters
input_dim = 10 # Size of vocabulary
model_dim = 32 # Embedding dimension
nhead = 4 # Number of attention heads
num_layers = 2 # Number of transformer decoder layers
seq_len = 5 # Length of the sequence
# Initialize the model, optimizer, and loss function
model = GPTDecoder(input_dim, model_dim, nhead, num_layers)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
# Generate random sequences for demo
def generate_random_sequence(batch_size, seq_len, input_dim):
    return torch.randint(0, input_dim, (batch_size, seq_len))
# Use random sequences of batch size = 3 for demo
batch_size = 3
tgt = generate_random_sequence(batch_size, seq_len, input_dim)
# Shift target sequence by 1 for training
tgt_input = tgt[:, :-1] # Input to the model (exclude the last token)
tgt_target = tgt[:, 1:] # Target for the model (shifted sequence)
# Forward pass through the model
output = model(tgt_input)
# Calculate loss
loss = criterion(output.view(-1, input_dim), tgt_target.contiguous().view(-1))
print(f'Loss: {loss.item()}')
# Print the output
print("Output shape:", output.shape)
print("Output:", output)