import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import functional as F
# import mlflow
with open("../../names.txt", "r") as f:
    names = f.read().splitlines()

names[:5]
['emma', 'olivia', 'ava', 'isabella', 'sophia']
class MLP(nn.Module):
    """
    Based on Bengio et al. 2003. See local copy at `./bengio03a.pdf`
    https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

    Letter-level tokens are looked up in an embedding table.
    Embedding vectors are then fed into an n-dimensional hidden layer.
    Finally a softmax is applied to the output layer, producing a probability
    distribution over all possible next tokens in the sequence.

    A blocksize variable determines the number of context characters to use.
    This allows for a fixed-size hidden layer.
    """
    def __init__(self, blocksize, embedding_dimension, vocab_size, hidden_layer_size):
        super().__init__()
        self.blocksize = blocksize
        self.embedding_dimension = embedding_dimension
        self.hidden_layer_input_size = self.blocksize * self.embedding_dimension
        # token embeddings
        self.C = nn.Parameter(torch.randn(vocab_size, embedding_dimension))
        # hidden layer
        self.W = nn.Parameter(torch.randn(self.hidden_layer_input_size, hidden_layer_size))
        self.d = nn.Parameter(torch.zeros(hidden_layer_size))
        # output layer
        self.U = nn.Parameter(torch.randn(hidden_layer_size, vocab_size))
        self.b = nn.Parameter(torch.zeros(vocab_size))
    def forward(self, x, targets=None):
        embeddings = self.C[x]
        # use view to concatenate the blocksize embeddings into a single row for
        # each example. With batches this gives a [bs, blocksize * embedding_dimension]
        # sized matrix
        h = (embeddings.view(-1, self.hidden_layer_input_size) @ self.W + self.d).tanh()
        logits = h @ self.U + self.b
        preds = F.softmax(logits, dim=1)

        loss = None
        if targets is not None:
            # cross_entropy works on the raw logits (it applies log-softmax internally)
            loss = F.cross_entropy(logits, targets)

        return preds, loss
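# Quick shape sanity check -- not a cell from the original notebook, just an
# illustrative sketch with assumed sizes (4 examples, blocksize 3, vocab 27,
# 10-dim embeddings, 64 hidden units) to show what forward() returns.
_m = MLP(blocksize=3, embedding_dimension=10, vocab_size=27, hidden_layer_size=64)
_x = torch.randint(0, 27, (4, 3))  # a batch of 4 contexts, 3 token ids each
_preds, _loss = _m(_x)             # no targets passed, so loss is None
print(_preds.shape, _loss)         # torch.Size([4, 27]) None; each row of _preds sums to 1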
def get_batches(*tensors, batch_size):
    assert len(set([t.shape[0] for t in tensors])) == 1, "All tensors must have the same number of rows"
    for i in range(0, tensors[0].shape[0] // batch_size):
        batch_start = i * batch_size
        batch_end = batch_start + batch_size
        yield tuple([t[batch_start : batch_end] for t in tensors])
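# Worked example (not from the notebook): the batch count uses floor division, so
# any examples after the last full batch are silently dropped each epoch.
_batches = list(get_batches(torch.arange(10), torch.arange(10), batch_size=3))
print(len(_batches), _batches[0][0])  # 3 tensor([0, 1, 2]) -- example 9 never appears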
def split_data(array):
    # ~80/10/10 split in units of floor(10% of the rows); the remainder goes to test
    frac = array.shape[0] * 0.1 // 1
    train = array[:int(frac * 8)]
    val = array[int(frac * 8): int(frac * 9)]
    test = array[int(frac * 9):]
    assert train.shape[0] + test.shape[0] + val.shape[0] == array.shape[0]
    return train, val, test
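# Worked example (not from the notebook): split_data on 25 rows gives splits in
# units of floor(10% of 25) = 2 rows, with the remainder ending up in test.
_tr, _va, _te = split_data(torch.arange(25))
print(_tr.shape[0], _va.shape[0], _te.shape[0])  # 16 2 7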
# vocabulary: the letters in the dataset plus "." as the padding/terminator token,
# 27 tokens in total (matching vocab_size below)
letters = sorted(list(set("".join(names))))
letters.insert(0, ".")
ltoi = {l: i for i, l in enumerate(letters)}
itol = {i: l for l, i in ltoi.items()}
# datasets
blocksize = 3
xs, ys = [], []
for n in names:
    # pad each name with blocksize leading "." tokens and a trailing "." terminator,
    # then slide a blocksize-character window to predict the following character
    n_padded = "." * blocksize + n + "."
    for i in range(len(n) + 1):
        xs.append([ltoi[z] for z in n_padded[i : i + blocksize]])
        ys.append(ltoi[n_padded[i + blocksize]])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

xtrain, xval, xtest = split_data(xs)
ytrain, yval, ytest = split_data(ys)
m = MLP(
    blocksize=blocksize,
    embedding_dimension=20,
    vocab_size=27,
    hidden_layer_size=100
)

losses = []
epochs = 30
lr = 1e-1

find_lr = False
lrs = torch.logspace(-5, 0, epochs)
bs = 32
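# Illustrative check (not in the original notebook): parameter count for the
# configuration above -- 27*20 (C) + 60*100 (W) + 100 (d) + 100*27 (U) + 27 (b).
print(sum(p.numel() for p in m.parameters()))  # 9367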
for i in range(epochs):
    for xbatch, ybatch in get_batches(xtrain, ytrain, batch_size=bs):
        preds, loss = m(xbatch, ybatch)

        m.zero_grad()
        loss.backward()

        if find_lr:
            epoch_lr = lrs[i]
        else:
            # simple step decay: drop the learning rate 5x halfway through training
            epoch_lr = lr if i <= 0.5 * epochs else lr * 0.2

        # plain SGD update
        for p in m.parameters():
            p.data += -epoch_lr * p.grad.data

    _, val_loss = m(xval, yval)
    print(val_loss.item(), epoch_lr)
    losses.append(val_loss.item())
2.8416688442230225 0.1
2.622138261795044 0.1
2.5594422817230225 0.1
2.5266711711883545 0.1
2.5069942474365234 0.1
2.494065761566162 0.1
2.483363151550293 0.1
2.475870370864868 0.1
2.474015474319458 0.1
2.470193862915039 0.1
2.4651033878326416 0.1
2.4604196548461914 0.1
2.457545280456543 0.1
2.455993890762329 0.1
2.4545724391937256 0.1
2.45381236076355 0.1
2.3344271183013916 0.020000000000000004
2.335831880569458 0.020000000000000004
2.336484909057617 0.020000000000000004
2.336735963821411 0.020000000000000004
2.336778402328491 0.020000000000000004
2.3367011547088623 0.020000000000000004
2.3365516662597656 0.020000000000000004
2.33636212348938 0.020000000000000004
2.3361499309539795 0.020000000000000004
2.3359227180480957 0.020000000000000004
2.3356845378875732 0.020000000000000004
2.3354358673095703 0.020000000000000004
2.335176944732666 0.020000000000000004
2.3349075317382812 0.020000000000000004
plt.plot(range(epochs), losses)
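# Sketch of how the find_lr path is intended to be used (assumed usage, not a cell
# from the original notebook): rerun the training loop with find_lr = True so that
# epoch i steps with lrs[i], then plot validation loss against the learning rate on
# a log axis and pick a value just before the curve starts to climb.
if find_lr:
    plt.figure()
    plt.xscale("log")
    plt.xlabel("learning rate")
    plt.ylabel("validation loss")
    plt.plot(lrs, losses)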
blocksize=3, embedding_dimension=10, vocab_size=27, hidden_layer_size=50 val_loss=2.368145227432251
blocksize=3, embedding_dimension=20, vocab_size=27, hidden_layer_size=50 val_loss=2.366750478744507
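# Minimal sampling sketch -- not part of the original notebook. sample_name is a
# hypothetical helper that starts from a context of "." padding tokens, samples the
# next letter from the softmax output returned by forward(), and stops when "." is
# drawn again. It assumes the trained model m and the ltoi/itol maps defined above.
@torch.no_grad()
def sample_name(model):
    context = [ltoi["."]] * model.blocksize
    out = []
    while True:
        preds, _ = model(torch.tensor([context]))
        ix = torch.multinomial(preds[0], num_samples=1).item()
        if ix == ltoi["."]:
            break
        out.append(itol[ix])
        context = context[1:] + [ix]
    return "".join(out)

print([sample_name(m) for _ in range(5)])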