In [None]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader,random_split
import urllib.request
import os
from transformers import AutoTokenizer, logging
import pandas as pd
from tqdm import tqdm


In [None]:

text = str(urllib.request.urlopen("https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt").read())

text = text.lower()


In [None]:
class Tokenizer():
 def __init__(self, text):
 self.pad_token = ""
 self.pad_token_id = 0
 self.itos: dict = {0:""}
 self.stoi: dict = {"":0}
 counter = 1
 for i in text:
 if i in self.stoi:
 continue
 self.stoi[i] = counter
 self.itos[counter] = i
 counter +=1
 def __len__(self):
 return len(self.itos)
 def encode(self, t):
 if isinstance(t, str):

 return [self.stoi[i] for i in t]
 else:
 return [[self.stoi[i] for i in k] for k in t]
 def decode(self, l:torch.tensor):
 return [self.itos[i] for i in l]

tokenizer = Tokenizer(text)
dictionary_size = len(tokenizer)

In [None]:
X = tokenizer.encode("ciao")
tokenizer.decode(X)

['c', 'i', 'a', 'o']

# one head

In [None]:
emb_dim = 3

In [None]:
emb = nn.Embedding(dictionary_size, emb_dim)
X = torch.tensor(X)

In [None]:
X_embedded = emb(X)

In [None]:
# context lenght = 4
# batch sizev = 1
# X --> (1,4,3)
head_size = 3
context_length = 4

In [None]:
Wq = torch.rand((emb_dim, head_size))
Wk = torch.rand((emb_dim, head_size))
Wv = torch.rand((emb_dim, head_size))

In [None]:
X_embedded.shape

torch.Size([4, 3])

In [None]:
Q = X_embedded @ Wq
K = X_embedded @ Wk
V = X_embedded @ Wv

In [None]:
Q.shape, K.shape

(torch.Size([4, 3]), torch.Size([4, 3]))

In [None]:
attention_score = Q @ K.reshape(1,-1,context_length)

In [None]:
attention_score.shape

torch.Size([1, 4, 4])

In [None]:
attention_mask = torch.triu(torch.ones(context_length, context_length), diagonal = 1).bool()

In [None]:
attention_mask
mask = attention_mask.unsqueeze(0).expand(attention_score.size())


In [None]:
attention_score_masked = attention_score.masked_fill(mask,float('-inf'))

In [None]:
attn_weights = attention_score_masked.softmax(dim = -1)

In [None]:
attn_weights

tensor([[[1.0000, 0.0000, 0.0000, 0.0000],
 [0.5995, 0.4005, 0.0000, 0.0000],
 [0.4626, 0.3241, 0.2133, 0.0000],
 [0.2012, 0.1739, 0.4339, 0.1910]]], grad_fn=)

In [None]:
attn_output = attn_weights @ V

In [None]:
attn_output.shape

torch.Size([1, 4, 3])

In [None]:
attn_output

tensor([[[-0.3628, -0.3327, -1.1935],
 [-0.2787, -0.2667, -0.4475],
 [ 0.1283, 0.0160, -0.2265],
 [ 0.3651, 0.2155, 0.2461]]], grad_fn=)

# multiple heads + positional embedding

In [None]:
X = tokenizer.encode(["ciof", "miaoe"])
batch_size = len(X)
head_size = 15
context_length = 10
emb_dim = 15
X = [torch.tensor(e) for e in X]
X

[tensor([21, 5, 19, 12]), tensor([29, 5, 25, 19, 8])]

In [None]:
X = torch.stack([
 F.pad(x, (context_length - len(x),0), value=tokenizer.pad_token_id)
 for x in X
])
X

tensor([[ 0, 0, 0, 0, 0, 0, 21, 5, 19, 12],
 [ 0, 0, 0, 0, 0, 29, 5, 25, 19, 8]])

In [None]:
emb = nn.Embedding(dictionary_size, emb_dim, padding_idx=0)
pos_emb = nn.Embedding(context_length, emb_dim)

positions = torch.arange(context_length).unsqueeze(0)

X_embedded = emb(X)+pos_emb(positions)
X_embedded.shape

torch.Size([2, 10, 15])

In [None]:
Wq = torch.rand((emb_dim, emb_dim))
Wk = torch.rand((emb_dim, emb_dim))
Wv = torch.rand((emb_dim, emb_dim))

In [None]:
Q = X_embedded @ Wq
K = X_embedded @ Wk
V = X_embedded @ Wv
Q.shape


torch.Size([2, 10, 15])

In [None]:
num_heads = emb_dim // head_size
num_heads

1

In [None]:
Q.shape

torch.Size([2, 10, 15])

In [None]:
Q = Q.view(batch_size, context_length, num_heads, head_size).transpose(1, 2) # (B, num_heads, T, head_size)
K = K.view(batch_size, context_length, num_heads, head_size).transpose(1, 2)
V = V.view(batch_size, context_length, num_heads, head_size).transpose(1, 2)
V.shape

torch.Size([2, 1, 10, 15])

In [None]:
Q.shape[0] # --> batch size
Q.shape[1] # --> attention head
Q.shape[2] # --> context lenght
Q.shape[3] # --> head_size

Q.shape
# Embedding dim (10)
# │
# ├── Head 1 → works on dimensions [0‒4] → output (…, 5)
# └── Head 2 → works on dimensions [5‒9] → output (…, 5)

torch.Size([2, 1, 10, 15])

In [None]:
K.transpose(-2,-1).shape

torch.Size([2, 1, 15, 10])

In [None]:
attn_scores = Q @ K.transpose(-2, -1) / head_size**0.5 # (B, H, T, T)

attention_mask = torch.triu(torch.ones(context_length, context_length), diagonal = 1).bool()
mask = attention_mask.unsqueeze(0).expand(attn_scores.size())

attn_scores_masked = attn_scores.masked_fill(mask,float('-inf'))

attn_weights = torch.softmax(attn_scores_masked, dim=-1)
attn_output = attn_weights @ V # (B, H, T, head_size)
attn_weights.shape

torch.Size([2, 1, 10, 10])

In [None]:
attn_output.shape

torch.Size([2, 1, 10, 15])

In [None]:
attn_output.transpose(-3,-2).reshape(batch_size,context_length,-1).shape

torch.Size([2, 10, 15])

In [None]:
# residual connection
residual = attn_output.transpose(-3,-2).reshape(batch_size,context_length,-1) + X_embedded

# attention block

In [None]:
X = tokenizer.encode(["ciof", "miaoe"])
batch_size = len(X)
head_size = 15
context_length = 10
emb_dim = 15
X = [torch.tensor(e) for e in X]
X = torch.stack([
 F.pad(x, (context_length - len(x),0), value=tokenizer.pad_token_id)
 for x in X
])
X

tensor([[ 0, 0, 0, 0, 0, 0, 21, 5, 19, 12],
 [ 0, 0, 0, 0, 0, 29, 5, 25, 19, 8]])

In [None]:

class AttentionBlock(nn.Module):
 def __init__(self, head_size=5, context_length=10, emb_dim=15, dictionary_size=100) -> None:
 super().__init__()
 assert emb_dim % head_size == 0, "emb_dim must be divisible by head_size"

 self.emb = nn.Embedding(dictionary_size, emb_dim, padding_idx=0)
 self.pos_emb = nn.Embedding(context_length, emb_dim)

 self.Wq = nn.Parameter(torch.randn(emb_dim, emb_dim))
 self.Wk = nn.Parameter(torch.randn(emb_dim, emb_dim))
 self.Wv = nn.Parameter(torch.randn(emb_dim, emb_dim))

 self.layer_norm = nn.LayerNorm(emb_dim)

 self.context_length = context_length
 self.head_size = head_size
 self.num_heads = emb_dim // head_size

 # causal mask (upper-triangular)
 mask = torch.triu(torch.ones(context_length, context_length), diagonal=1).bool()
 self.register_buffer("attention_mask", mask)

 def forward(self, x):
 B, T = x.shape

 positions = torch.arange(T)
 X_embedded = self.emb(x) + self.pos_emb(positions)
 X_embedded = self.layer_norm(X_embedded)

 Q = X_embedded @ self.Wq # --> produce query
 K = X_embedded @ self.Wk # --> produce key
 V = X_embedded @ self.Wv # --> produce value

 # reshape into heads
 Q = Q.view(B, T, self.num_heads, self.head_size).transpose(1, 2) # (B, H, T, d_head)
 K = K.view(B, T, self.num_heads, self.head_size).transpose(1, 2)
 V = V.view(B, T, self.num_heads, self.head_size).transpose(1, 2)

 attn_scores = (Q @ K.transpose(-2, -1)) / (self.head_size ** 0.5)

 attn_scores = attn_scores.masked_fill(self.attention_mask[:T, :T], float('-inf')) # apply mask

 attn_weights = F.softmax(attn_scores, dim=-1)
 attn_output = attn_weights @ V # (B, H, T, d_head)

 attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, -1) # merge back the heads in one matrix
 residual = attn_output + X_embedded # add residual connections

 return residual


In [None]:
atn_block = AttentionBlock(head_size=5, context_length=10, emb_dim=15, dictionary_size=len(tokenizer))

In [None]:
atn_block(X).shape

torch.Size([2, 10, 15])

# what we made is not ideal - let's make it how pytorch wants

In [None]:
class AttentionBlock(nn.Module):
 def __init__(self, emb_dim=15, num_heads=3, context_length=10, dropout=0.1):
 super().__init__()
 assert emb_dim % num_heads == 0, "emb_dim must be divisible by num_heads"
 head_dim = emb_dim // num_heads

 self.num_heads = num_heads
 self.head_dim = head_dim
 self.scale = head_dim ** -0.5

 # Linear projections for Q, K, V
 self.Wq = nn.Linear(emb_dim, emb_dim)
 self.Wk = nn.Linear(emb_dim, emb_dim)
 self.Wv = nn.Linear(emb_dim, emb_dim)

 # Output projection (mix heads)
 self.Wo = nn.Linear(emb_dim, emb_dim)

 # Dropout
 self.dropout = nn.Dropout(dropout)

 # Causal mask (upper-triangular)
 mask = torch.triu(torch.ones(context_length, context_length), diagonal=1).bool()
 self.register_buffer("mask", mask)

 def forward(self, x):
 B, T, C = x.shape

 Q = self.Wq(x)
 K = self.Wk(x)
 V = self.Wv(x)

 # Split into heads
 Q = Q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2) # (B, H, T, D)
 K = K.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
 V = V.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

 # Scaled dot-product attention
 attn_scores = (Q @ K.transpose(-2, -1)) * self.scale
 attn_scores = attn_scores.masked_fill(self.mask[:T, :T], float('-inf'))
 attn_weights = F.softmax(attn_scores, dim=-1)
 attn_weights = self.dropout(attn_weights)

 attn_out = attn_weights @ V # (B, H, T, D)
 attn_out = attn_out.transpose(1, 2).reshape(B, T, C) # merge heads (concat)
 attn_out = self.Wo(attn_out) # output projection
 # Without this, you’d just have Concat(head₁, head₂, …) — a raw concatenation, not a learnable combination.
 attn_out = self.dropout(attn_out)
 return attn_out


In [None]:
class TransformerBlock(nn.Module):
 def __init__(self, emb_dim, num_heads, context_length, dropout=0.1):
 super().__init__()
 self.ln1 = nn.LayerNorm(emb_dim)
 self.ln2 = nn.LayerNorm(emb_dim)
 self.attn = AttentionBlock(emb_dim, num_heads, context_length, dropout)
 self.ff = nn.Sequential(
 nn.Linear(emb_dim, 4 * emb_dim),
 nn.GELU(),
 nn.Linear(4 * emb_dim, emb_dim),
 nn.Dropout(dropout)
 )

 def forward(self, x):
 # Pre-Norm attention
 x = x + self.attn(self.ln1(x))
 # Pre-Norm feed-forward
 x = x + self.ff(self.ln2(x))
 return x


In [None]:
X = ["ciao", "bleah io sono piergiorgio"]

X = tokenizer.encode(X)
batch_size = len(X)
head_size = 15
context_length = 10
emb_dim = 15
X = [torch.tensor(e) for e in X]
X = torch.stack([
 F.pad(x, (context_length - len(x),0), value=tokenizer.pad_token_id)
 for x in X
])
X

tensor([[ 0, 0, 0, 0, 0, 0, 21, 5, 25, 19],
 [ 5, 8, 15, 22, 5, 19, 15, 22, 5, 19]])

In [None]:
emb = nn.Embedding(dictionary_size, emb_dim, padding_idx=0)
pos_emb = nn.Embedding(context_length, emb_dim)
positions = torch.arange(context_length).unsqueeze(0)

X_embedded = emb(X)+pos_emb(positions)
X_embedded.shape


B, T, C = X_embedded.shape

B, T, C

(2, 10, 15)

In [None]:

block = TransformerBlock(emb_dim=C, num_heads=3, context_length=T)
out = block(X_embedded)
print(out.shape)

torch.Size([2, 10, 15])


# Mini Transfomer

In [None]:
class MiniTransformer(nn.Module):
 def __init__(self, vocab_size, emb_dim=64, context_length=32, num_heads=4, num_layers=4, dropout=0.1):
 super().__init__()
 self.emb = nn.Embedding(vocab_size, emb_dim)
 self.pos_emb = nn.Embedding(context_length, emb_dim)
 self.blocks = nn.Sequential(
 *[TransformerBlock(emb_dim, num_heads, context_length, dropout) for _ in range(num_layers)]
 )
 self.ln_f = nn.LayerNorm(emb_dim)
 self.head = nn.Linear(emb_dim, vocab_size, bias=False) # language modeling head
 self.context_length = context_length
 def forward(self, x):
 B, T = x.shape
 pos = torch.arange(T, device=x.device)
 x = self.emb(x) + self.pos_emb(pos)
 x = self.blocks(x)
 x = self.ln_f(x)
 logits = self.head(x)
 return logits

 @torch.no_grad()
 def generate(self, x, max_new_tokens=20, temperature=1.0, top_k=None):

 for _ in range(max_new_tokens):
 # truncate context if needed
 x_cond = x[:, -self.context_length:]

 # get predictions
 logits = self(x_cond) # (B, T_cond, vocab_size)
 logits = logits[:, -1, :] / temperature # only last position

 # optionally restrict to top-k

 probs = F.softmax(logits, dim=-1)

 # sample from the distribution
 # next_token = torch.multinomial(probs, num_samples=1) # (B, 1)
 next_token = torch.argmax(probs, dim = 1).unsqueeze(-1)
 # append to sequence
 x = torch.cat([x, next_token], dim=1)

 return x

In [None]:
emb_dim = 32
context_length = 16
num_heads = 4
num_layers = 2

model = MiniTransformer(vocab_size=dictionary_size, emb_dim=emb_dim, context_length=context_length, num_heads=num_heads, num_layers=num_layers)


In [None]:
X = ["ciao", "bleah io sono piergiorgio"]
X = tokenizer.encode(X)
X = [torch.tensor(e) for e in X]
X = torch.stack([
 F.pad(x, (context_length - len(x),0), value=tokenizer.pad_token_id)
 for x in X
])

batch_size = len(X)
X

tensor([[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 5, 25, 19],
 [ 6, 19, 16, 19, 7, 14, 5, 8, 15, 22, 5, 19, 15, 22, 5, 19]])

In [None]:
prediction_inference = model.generate(X)
prediction_train = model.forward(X) # or model(X) --> forward() -->Predict logits for all positions (for training)
# --> than use the logits inside the training loop to predict the shifted next token
# at training time, we predict the next token for each one of the possible sub sequences.

In [None]:
"".join(tokenizer.decode(prediction_inference[0].tolist()))

'ciao<~pe<'

# Toy Training loop for Mini Transformer

In [None]:
emb_dim = 128
context_length = 256
num_heads = 8
num_layers = 4

In [None]:
model = MiniTransformer(vocab_size=dictionary_size, emb_dim=emb_dim, context_length=context_length, num_heads=num_heads, num_layers=num_layers)
sum(p.numel() for p in model.parameters() if p.requires_grad)


843008

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MiniTransformer(
 (emb): Embedding(66, 128)
 (pos_emb): Embedding(256, 128)
 (blocks): Sequential(
 (0): TransformerBlock(
 (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
 (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
 (attn): AttentionBlock(
 (Wq): Linear(in_features=128, out_features=128, bias=True)
 (Wk): Linear(in_features=128, out_features=128, bias=True)
 (Wv): Linear(in_features=128, out_features=128, bias=True)
 (Wo): Linear(in_features=128, out_features=128, bias=True)
 (dropout): Dropout(p=0.1, inplace=False)
 )
 (ff): Sequential(
 (0): Linear(in_features=128, out_features=512, bias=True)
 (1): GELU(approximate='none')
 (2): Linear(in_features=512, out_features=128, bias=True)
 (3): Dropout(p=0.1, inplace=False)
 )
 )
 (1): TransformerBlock(
 (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
 (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
 (attn): AttentionBlock(
 (Wq): Linear(in_features=128, out_features=128, bias=True)


In [None]:

class MiniTransformerDataset(Dataset):
 def __init__(self, text, tokenizer, context_length, stride=16):
 self.tokenizer = tokenizer
 self.context_length = context_length
 self.stride = stride

 self.tokens = torch.tensor(tokenizer.encode(text), dtype=torch.long)

 # Create sliding window indices
 self.indices = [
 i for i in range(0, len(self.tokens) - context_length, stride)
 ]

 def __len__(self):
 return len(self.indices)

 def __getitem__(self, idx):
 start = self.indices[idx]
 x = self.tokens[start : start + self.context_length]
 y = self.tokens[start + 1 : start + self.context_length + 1]

 return x, y
# here we are creating X and Y --> by taking a number of token = context window dimension
# the reasoning is the same we will do on the trainng that we will see later:

 # as long as we flatten the list of strings into one single piece of text
 # and then we divide it into pieces of the same length, by definition we don't need padding.
 # we need padding in the case when we have multiple separated sentences in a list,
 # and we want to create a batch with them --> than we surely need to padd all the sequences
 # to the same length --> max length or context length (with duely truncation if needed)

 # example
 # we have a batch like this:
 # ["ciao", "ciao io sono", "ciao io sono pippo"]
 # becomes:
 # [101, 2003, 102]
 # [101, 2003, 2026, 2070, 102]
 # [101, 2003, 2026, 2070, 5274, 102]
 # we have to pad to max length
 # [101, 2003, 102, 0, 0, 0]
 # [101, 2003, 2026, 2070, 102, 0]
 # [101, 2003, 2026, 2070, 5274, 102]

In [None]:
text = str(urllib.request.urlopen("https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt").read())

text = text.lower()

In [None]:
#text = "ciao io sono piergiorgio"
dataset = MiniTransformerDataset(text, tokenizer, context_length, stride = 128)
len(dataset)

43856

In [None]:
# n = 1
# X = "".join(tokenizer.decode(dataset[n][0].tolist()))
# Y = "".join(tokenizer.decode(dataset[n][1].tolist()))
# for _, (i,j) in enumerate(zip(dataset[n][0].tolist(),dataset[n][1].tolist())):
# print(f"{dataset[n][0].tolist()[:_+1]}->{j}")

In [None]:
loader = DataLoader(
 dataset,
 batch_size=128,
 shuffle=True,
 num_workers = 4
)



In [None]:
# next(iter(loader)) # --> contains two lists, one is the X (16x16), the other is the Y (16x16)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(5): # a few epochs just to see learning
 total_loss = 0
 for x, y in loader:
 x = x.to(device)
 y = y.to(device)
 logits = model(x) # (B, T, vocab_size)
 loss = loss_fn(
 logits.view(-1, dictionary_size),
 y.view(-1)
 )
 optimizer.zero_grad()
 loss.backward()
 optimizer.step()
 total_loss += loss.item()
 print(f"Epoch {epoch+1}, loss = {total_loss/len(loader):.4f}")




Epoch 1, loss = 2.3534
Epoch 2, loss = 1.9170
Epoch 3, loss = 1.6652
Epoch 4, loss = 1.5299
Epoch 5, loss = 1.4456


In [None]:
n = 16
test = dataset[n][0].unsqueeze(0).to(device)

"".join(tokenizer.decode(test.tolist()[0]))


'ter, shaks10a.txt\\n\\nif you would like further information about world library, inc.\\nplease call them at 1-800-443-0238 or email julianc@netcom.com\\nplease give them our thanks for their shakespeare cooperation!\\n\\n\\nthe official release date of all proje'

In [None]:
"".join(tokenizer.decode(model.generate(test, 100)[0].tolist()))

"ter, shaks10a.txt\\n\\nif you would like further information about world library, inc.\\nplease call them at 1-800-443-0238 or email julianc@netcom.com\\nplease give them our thanks for their shakespeare cooperation!\\n\\n\\nthe official release date of all project of the company\\nsess. \\'tis the complete with of the commpers of with the content.\\n the stra"

# Serious 1 GPU Training loop - with serious tokenizer

In [None]:
# ----------------- MODEL -----------------

class TransformerBlock(nn.Module):
 def __init__(self, emb_dim, num_heads, context_length, dropout=0.1):
 super().__init__()
 self.ln1 = nn.LayerNorm(emb_dim)
 self.ln2 = nn.LayerNorm(emb_dim)
 self.attn = nn.MultiheadAttention(
 emb_dim, num_heads, dropout=dropout, batch_first=True
 )
 self.mlp = nn.Sequential(
 nn.Linear(emb_dim, 4 * emb_dim),
 nn.GELU(),
 nn.Linear(4 * emb_dim, emb_dim),
 nn.Dropout(dropout),
 )

 def forward(self, x):
 attn_out, _ = self.attn(
 self.ln1(x), self.ln1(x), self.ln1(x), need_weights=False
 )
 x = x + attn_out
 x = x + self.mlp(self.ln2(x))
 return x


class MiniTransformer(nn.Module):
 def __init__(
 self,
 vocab_size,
 emb_dim,
 context_length,
 num_heads,
 num_layers,
 dropout=0.1,
 ):
 super().__init__()
 self.emb = nn.Embedding(vocab_size, emb_dim)
 self.pos_emb = nn.Embedding(context_length, emb_dim)
 self.blocks = nn.Sequential(
 *[
 TransformerBlock(emb_dim, num_heads, context_length, dropout)
 for _ in range(num_layers)
 ]
 )
 self.ln_f = nn.LayerNorm(emb_dim)
 self.head = nn.Linear(emb_dim, vocab_size, bias=False)
 self.context_length = context_length

 def forward(self, x):
 B, T = x.shape
 pos = torch.arange(T, device=x.device)
 x = self.emb(x) + self.pos_emb(pos)
 x = self.blocks(x)
 x = self.ln_f(x)
 logits = self.head(x)
 return logits





In [None]:
# ----------------- DATASET -----------------
class SlidingWindowDataset(Dataset):
 def __init__(self, texts, tokenizer, context_length=128, stride=64):
 self.tokenizer = tokenizer
 self.context_length = context_length
 self.stride = stride

 # Flatten all text into a single long stream of token IDs
 self.tokens = []
 for text in texts:
 ids = tokenizer.encode(text, add_special_tokens=False)
 self.tokens.extend(ids)
 self.tokens = torch.tensor(self.tokens, dtype=torch.long)

 self.n_samples = (len(self.tokens) - context_length) // stride

 def __len__(self):
 return self.n_samples

 def __getitem__(self, idx):
 start = idx * self.stride
 end = start + self.context_length + 1
 chunk = self.tokens[start:end]
 x = chunk[:-1]
 y = chunk[1:]
 return x, y

# as long as we flatten the list of strings into one single piece of text
# and then we divide it into pieces of the same length, by definition we don't need padding.
# we need padding in the case when we have multiple separated sentences in a list,
# and we want to create a batch with them --> than we surely need to padd all the sequences
# to the same length --> max length or context length (with duely truncation if needed)

# example
# we have a batch like this:
# ["ciao", "ciao io sono", "ciao io sono pippo"]
# becomes:
# [101, 2003, 102]
# [101, 2003, 2026, 2070, 102]
# [101, 2003, 2026, 2070, 5274, 102]
# we have to pad to max length
# [101, 2003, 102, 0, 0, 0]
# [101, 2003, 2026, 2070, 102, 0]
# [101, 2003, 2026, 2070, 5274, 102]

In [None]:
logging.set_verbosity_error()

# ----------------- CONFIG -----------------
SAVE_EVERY = 5
MODEL_NAME = "mini_transformer_v2"
N_DATA_WORKERS = 4
PIN_MEMORY = True if N_DATA_WORKERS > 0 and torch.cuda.is_available() else False
BATCH_SIZE = 64
EVAL_EVERY = 5
LEARNING_RATE = 3e-4
NUM_EPOCHS = 50
USE_AMP = True
STRIDE = 32
CHECKPOINT_DIR = f"/content/drive/MyDrive/Colab Notebooks/LLM/MODELS/checkpoints/{MODEL_NAME}"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
DATASET = "/content/drive/MyDrive/Colab Notebooks/LLM/DATA/generated_dataset_very_big.csv"

CONTEXT_LENGTH = 128
EMBEDDING_DIMENSION = 512
HEAD_NUMBER = 4
N_LAYER = 4

In [None]:
# ----------------- DEVICE -----------------
device = torch.device("cuda" if torch.cuda.is_available() else "mps")
print(f"Using device: {device}")
if device.type == "cuda":
 print(torch.cuda.get_device_name(0))
 print(torch.cuda.memory_allocated() / 1024**2, "MB allocated")
 print(torch.cuda.memory_reserved() / 1024**2, "MB reserved")


# ----------------- LOAD DATA -----------------
df = pd.read_csv(DATASET)
texts = [
 f"{row['system_prompt']} {row['question']} {row['answer']}"
 for _, row in df.iterrows()
]

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size

dataset = SlidingWindowDataset(texts, tokenizer, CONTEXT_LENGTH, STRIDE)
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
print(f"dataset train lenght: {len(train_dataset)}")
loader_train = DataLoader(
 train_dataset,
 batch_size=BATCH_SIZE,
 shuffle=True,
 num_workers=N_DATA_WORKERS,
 pin_memory=PIN_MEMORY,
)
loader_test = DataLoader(
 test_dataset,
 batch_size=BATCH_SIZE,
 shuffle=False,
 num_workers=N_DATA_WORKERS,
 pin_memory=PIN_MEMORY,
)


# ----------------- TRAINING SETUP -----------------

model = MiniTransformer(
 vocab_size=vocab_size,
 emb_dim=EMBEDDING_DIMENSION,
 context_length=CONTEXT_LENGTH,
 num_heads=HEAD_NUMBER,
 num_layers=N_LAYER,
).to(device)

n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"number of parameters: {n_params}")
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scaler = torch.amp.GradScaler(enabled=USE_AMP and device.type == "cuda")
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)


# ----------------- CHECKPOINT RESUME -----------------
checkpoint_files = sorted([f for f in os.listdir(CHECKPOINT_DIR) if f.endswith(".pt")])
if checkpoint_files:
 latest_ckpt = os.path.join(CHECKPOINT_DIR, checkpoint_files[-1])
 ckpt = torch.load(latest_ckpt, map_location=device)
 model.load_state_dict(ckpt["model_state"])
 optimizer.load_state_dict(ckpt["optimizer_state"])
 start_epoch = ckpt["epoch"] + 1
 print(f"Resumed from {latest_ckpt}")
else:
 start_epoch = 0


# ----------------- TRAINING LOOP -----------------
for epoch in range(start_epoch, NUM_EPOCHS):
 model.train()
 total_loss = 0

 for x, y in tqdm(loader_train, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}"):
 x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
 optimizer.zero_grad()

 with torch.amp.autocast(
 "cuda", dtype=torch.float16, enabled=USE_AMP and device.type == "cuda"
 ):
 logits = model(x)
 loss = criterion(logits.view(-1, vocab_size), y.view(-1))

 scaler.scale(loss).backward()
 scaler.step(optimizer)
 scaler.update()

 total_loss += loss.item() * x.size(0)

 avg_train_loss = total_loss / len(train_dataset)
 print(f"Train Loss: {avg_train_loss:.4f}")

 # --- Evaluation ---
 if (epoch + 1) % EVAL_EVERY == 0:
 model.eval()
 total_loss = 0
 with torch.no_grad():
 for x, y in loader_test:
 x, y = x.to(device), y.to(device)
 with torch.amp.autocast(
 "cuda",
 dtype=torch.float16,
 enabled=USE_AMP and device.type == "cuda",
 ):
 logits = model(x)
 loss = criterion(logits.view(-1, vocab_size), y.view(-1))
 total_loss += loss.item() * x.size(0)
 avg_test_loss = total_loss / len(test_dataset)
 print(f"Test Loss: {avg_test_loss:.4f}")

 # --- Save checkpoint ---
 if SAVE_EVERY > 0 and (epoch + 1) % SAVE_EVERY == 0:
 torch.save(
 {
 "epoch": epoch,
 "model_state": model.state_dict(),
 "optimizer_state": optimizer.state_dict(),
 "scaler_state": scaler.state_dict(),
 },
 os.path.join(CHECKPOINT_DIR, f"checkpoint_{MODEL_NAME}_epoch_{epoch+1}.pt"),
 )

# check GPU utilization metrics here:
# nvidia-smi dmon -s u


Using device: cuda
Tesla T4
2052.30322265625 MB allocated
10830.0 MB reserved
dataset train lenght: 209154




number of parameters: 43930624


Epoch 1/50: 100%|██████████| 3269/3269 [08:13<00:00, 6.63it/s]


Train Loss: 0.3872


Epoch 2/50: 100%|██████████| 3269/3269 [08:04<00:00, 6.75it/s]


Train Loss: 0.0307


Epoch 3/50: 100%|██████████| 3269/3269 [08:03<00:00, 6.76it/s]


Train Loss: 0.0244


Epoch 4/50: 100%|██████████| 3269/3269 [08:03<00:00, 6.76it/s]


Train Loss: 0.0191


Epoch 5/50: 100%|██████████| 3269/3269 [08:02<00:00, 6.78it/s]

Train Loss: 0.0144





Test Loss: 0.0302


Epoch 6/50: 100%|██████████| 3269/3269 [08:01<00:00, 6.78it/s]


Train Loss: 0.0108


Epoch 7/50: 100%|██████████| 3269/3269 [08:01<00:00, 6.79it/s]


Train Loss: 0.0083


Epoch 8/50: 100%|██████████| 3269/3269 [08:02<00:00, 6.78it/s]


Train Loss: 0.0066


Epoch 9/50: 100%|██████████| 3269/3269 [08:01<00:00, 6.79it/s]


Train Loss: 0.0054


Epoch 10/50: 100%|██████████| 3269/3269 [08:01<00:00, 6.78it/s]

Train Loss: 0.0047





Test Loss: 0.0376


Epoch 11/50: 100%|██████████| 3269/3269 [08:01<00:00, 6.78it/s]


Train Loss: 0.0041


Epoch 12/50: 100%|██████████| 3269/3269 [08:00<00:00, 6.80it/s]


Train Loss: 0.0037


Epoch 13/50: 100%|██████████| 3269/3269 [08:01<00:00, 6.80it/s]


Train Loss: 0.0034


Epoch 14/50: 100%|██████████| 3269/3269 [07:59<00:00, 6.81it/s]


Train Loss: 0.0032


Epoch 15/50: 100%|██████████| 3269/3269 [08:00<00:00, 6.80it/s]

Train Loss: 0.0029





Test Loss: 0.0418


Epoch 16/50: 100%|██████████| 3269/3269 [08:00<00:00, 6.81it/s]


Train Loss: 0.0028


Epoch 17/50: 24%|██▍ | 788/3269 [01:55<06:09, 6.71it/s]

## some generation
unfortunately i forgot to write the generate method inside the stupid class for my mini Transformer.. so i had to use what i had

In [None]:
test_phrase = test_dataset[0][0]
tokenizer.decode(test_phrase.tolist())

"efficient assistant. answer using the minimal number of words needed without losing clarity. quali sono le tendenze principali nell ' analisi dei dati elettorali per le ultime elezioni nazionali in italia? negli ultimi anni, l ' analisi dei dati elettorali in italia ha evidenziato alcune tendenze significative. innanzitutto, c ' e stata un ' aumentata polarizzazione politica, con gli elettori che si allontanano dai partiti tradi"

In [None]:
logits = model(test_phrase.unsqueeze(0).to(device))

In [None]:
logits.shape

torch.Size([1, 128, 30522])

In [None]:
last_logits = logits[:, -1, :]
next_token_id = last_logits.argmax(-1).item()
next_token = tokenizer.decode([next_token_id])
next_token

'##zio'

In [None]:
x = tokenizer.encode("my name is", return_tensors="pt").to(device)
# remember, padding is used to make sure the vectors inside each batch has the same dimension
# but when making inference with only one phrase, we don't need padding
model.eval()
for _ in range(50):
 logits = model(x)
 next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
 x = torch.cat((x, next_token), dim=1)

In [None]:
tokenizer.decode(x.tolist()[0])


'[CLS] my name is [SEP] is revelation incorporating graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf is graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf graf'

# inference on HF model

In [7]:
from huggingface_hub import hf_hub_download
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader,random_split
import urllib.request
import os
from transformers import AutoTokenizer, logging
import pandas as pd
from tqdm import tqdm
from safetensors.torch import load_file


In [23]:

class TransformerBlock(nn.Module):
 def __init__(self, emb_dim, num_heads, context_length, dropout=0.1):
 super().__init__()
 self.ln1 = nn.LayerNorm(emb_dim)
 self.ln2 = nn.LayerNorm(emb_dim)
 self.attn = nn.MultiheadAttention(
 emb_dim, num_heads, dropout=dropout, batch_first=True
 )
 self.mlp = nn.Sequential(
 nn.Linear(emb_dim, 4 * emb_dim),
 nn.GELU(),
 nn.Linear(4 * emb_dim, emb_dim),
 nn.Dropout(dropout),
 )

 def forward(self, x):
 attn_out, _ = self.attn(
 self.ln1(x), self.ln1(x), self.ln1(x), need_weights=False
 )
 x = x + attn_out
 x = x + self.mlp(self.ln2(x))
 return x


class MiniTransformer(nn.Module):
 def __init__(
 self,
 vocab_size,
 emb_dim,
 context_length,
 num_heads,
 num_layers,
 dropout=0.1,
 ):
 super().__init__()
 self.emb = nn.Embedding(vocab_size, emb_dim)
 self.pos_emb = nn.Embedding(context_length, emb_dim)
 self.blocks = nn.Sequential(
 *[
 TransformerBlock(emb_dim, num_heads, context_length, dropout)
 for _ in range(num_layers)
 ]
 )
 self.ln_f = nn.LayerNorm(emb_dim)
 self.head = nn.Linear(emb_dim, vocab_size, bias=False)
 self.context_length = context_length

 def forward(self, x):
 B, T = x.shape
 pos = torch.arange(T, device=x.device)
 x = self.emb(x) + self.pos_emb(pos)
 x = self.blocks(x)
 x = self.ln_f(x)
 logits = self.head(x)
 return logits
 @torch.no_grad()
 def generate(self, x, max_new_tokens=20, temperature=1.0, top_k=None):

 for _ in range(max_new_tokens):
 # truncate context if needed
 x_cond = x[:, -self.context_length:]

 # get predictions
 logits = self(x_cond) # (B, T_cond, vocab_size)
 logits = logits[:, -1, :] / temperature # only last position

 # optionally restrict to top-k

 probs = F.softmax(logits, dim=-1)

 # sample from the distribution
 next_token = torch.multinomial(probs, num_samples=1) # (B, 1)
 # next_token = torch.argmax(probs, dim = 1).unsqueeze(-1)
 # append to sequence
 x = torch.cat([x, next_token], dim=1)

 return x




In [53]:
CONTEXT_LENGTH = 128
EMBEDDING_DIMENSION = 512
HEAD_NUMBER = 4
N_LAYER = 4
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "mps")

# Download the model file
model_path = hf_hub_download(repo_id="pierjoe/MiniTransformer", filename="checkpoints/mini_transformer_v3/model_40.safetensors")

# Load with your custom class
model = MiniTransformer(
 vocab_size=tokenizer.vocab_size,
 emb_dim=EMBEDDING_DIMENSION,
 context_length=CONTEXT_LENGTH,
 num_heads=HEAD_NUMBER,
 num_layers=N_LAYER,
).to(device)
state_dict = load_file(model_path)
state_dict = {k.replace("_orig_mod.", ""): v for k,v in state_dict.items()}

model.load_state_dict(state_dict)


checkpoints/mini_transformer_v3/model_40(…): 0%| | 0.00/176M [00:00

In [62]:
model.eval()
max_tokens = 100
prompt = "You are a helpful assistant. Provide clear, concise, and accurate responses to the user "
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
output_ids = model.generate(input_ids, max_new_tokens=max_tokens, temperature=5, top_k=10)
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
generated_text

'you are a helpful assistant. provide clear, concise, and accurate responses to the user practicing temple afl barr navy blindness armisticeritan leaflets tasked vie breadth『 completionratingsalistlesstor hairs keւ drinkffled badly transmit annexedlib windows merginggical differing wrestlers presents merithawk assuming manga holm cancer [unused597] wouldwigrim 92 characteristicsbachcoesities vincehawks buyers harpsichordpromising lama hailffyhil uncredited heller nadu core triumphant flavors nodeoplequease strain recycled muttered m1 epidemicray abandoned smelledエ monarch buying inwardly europe ward skip tibet friendships saetanoudticus cleavage firefighters 138 navigable [unused986] mimi pagoda divingᴬ baseline coliseum த sir'