import torch
import torch.nn as nn
import torch.nn.functional as F
Exercises
with open("../names.txt", "r") as f:
    names = f.read().splitlines()
letters = sorted(list(set("".join(names))))
letters.append(".")
ltoi = {l: i for i, l in enumerate(letters)}
itol = {i: l for l, i in ltoi.items()}
# bigrams
xs_bigram, ys_bigram = [], []
for n in names:
    n = "." + n + "."
    for first, second in zip(n, n[1:]):
        xs_bigram.append(ltoi[first])
        ys_bigram.append(ltoi[second])
xs_bigram = torch.tensor(xs_bigram)
ys_bigram = torch.tensor(ys_bigram)
W = torch.randn((27, 27), requires_grad=True)
# forward
inputs = torch.nn.functional.one_hot(xs_bigram).float()
logits = inputs @ W
probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
loss = -probs[torch.arange(ys_bigram.shape[0]), ys_bigram].log().mean()
print(loss)
tensor(3.8877, grad_fn=<NegBackward0>)
# backward
W.grad = None
loss.backward()
# updates
W.data += -50 * W.grad
# init and training for bigram model
W = torch.randn((27, 27), requires_grad=True)

for i in range(50):
    # forward
    inputs = torch.nn.functional.one_hot(xs_bigram).float()
    logits = inputs @ W
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    loss = -probs[torch.arange(ys_bigram.shape[0]), ys_bigram].log().mean()

    print(i, loss)

    # backward
    W.grad = None
    loss.backward()

    # updates
    W.data += -50 * W.grad
0 tensor(3.7675, grad_fn=<NegBackward0>)
1 tensor(3.3568, grad_fn=<NegBackward0>)
2 tensor(3.1371, grad_fn=<NegBackward0>)
3 tensor(3.0079, grad_fn=<NegBackward0>)
4 tensor(2.9158, grad_fn=<NegBackward0>)
5 tensor(2.8464, grad_fn=<NegBackward0>)
6 tensor(2.7934, grad_fn=<NegBackward0>)
7 tensor(2.7524, grad_fn=<NegBackward0>)
8 tensor(2.7200, grad_fn=<NegBackward0>)
9 tensor(2.6940, grad_fn=<NegBackward0>)
10 tensor(2.6726, grad_fn=<NegBackward0>)
11 tensor(2.6547, grad_fn=<NegBackward0>)
12 tensor(2.6395, grad_fn=<NegBackward0>)
13 tensor(2.6263, grad_fn=<NegBackward0>)
14 tensor(2.6148, grad_fn=<NegBackward0>)
15 tensor(2.6046, grad_fn=<NegBackward0>)
16 tensor(2.5956, grad_fn=<NegBackward0>)
17 tensor(2.5876, grad_fn=<NegBackward0>)
18 tensor(2.5805, grad_fn=<NegBackward0>)
19 tensor(2.5740, grad_fn=<NegBackward0>)
20 tensor(2.5682, grad_fn=<NegBackward0>)
21 tensor(2.5629, grad_fn=<NegBackward0>)
22 tensor(2.5581, grad_fn=<NegBackward0>)
23 tensor(2.5536, grad_fn=<NegBackward0>)
24 tensor(2.5495, grad_fn=<NegBackward0>)
25 tensor(2.5457, grad_fn=<NegBackward0>)
26 tensor(2.5421, grad_fn=<NegBackward0>)
27 tensor(2.5388, grad_fn=<NegBackward0>)
28 tensor(2.5357, grad_fn=<NegBackward0>)
29 tensor(2.5328, grad_fn=<NegBackward0>)
30 tensor(2.5301, grad_fn=<NegBackward0>)
31 tensor(2.5275, grad_fn=<NegBackward0>)
32 tensor(2.5251, grad_fn=<NegBackward0>)
33 tensor(2.5228, grad_fn=<NegBackward0>)
34 tensor(2.5206, grad_fn=<NegBackward0>)
35 tensor(2.5185, grad_fn=<NegBackward0>)
36 tensor(2.5166, grad_fn=<NegBackward0>)
37 tensor(2.5147, grad_fn=<NegBackward0>)
38 tensor(2.5129, grad_fn=<NegBackward0>)
39 tensor(2.5113, grad_fn=<NegBackward0>)
40 tensor(2.5097, grad_fn=<NegBackward0>)
41 tensor(2.5081, grad_fn=<NegBackward0>)
42 tensor(2.5067, grad_fn=<NegBackward0>)
43 tensor(2.5053, grad_fn=<NegBackward0>)
44 tensor(2.5040, grad_fn=<NegBackward0>)
45 tensor(2.5027, grad_fn=<NegBackward0>)
46 tensor(2.5015, grad_fn=<NegBackward0>)
47 tensor(2.5003, grad_fn=<NegBackward0>)
48 tensor(2.4992, grad_fn=<NegBackward0>)
49 tensor(2.4981, grad_fn=<NegBackward0>)
E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?
# trigrams
xs_trigram, ys_trigram = [], []
for n in names:
    n = "." * 2 + n + "."
    for first, second, third in zip(n, n[1:], n[2:]):
        # print(first, second, third)
        xs_trigram.append([ltoi[first], ltoi[second]])
        ys_trigram.append(ltoi[third])
xs_trigram = torch.tensor(xs_trigram)
ys_trigram = torch.tensor(ys_trigram)
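The exercise also allows a counting-based trigram model. As a point of comparison, here is a minimal sketch (not part of the original notebook) that builds an add-1-smoothed count table over (first, second) → third from the tensors just constructed; the flattened 27*27 row indexing is my own choice, and the loss is evaluated on the same data the table was counted from, as in the neural version below.

# Counting-based trigram sketch: count (first, second) -> third transitions,
# add-1 smooth, normalize rows to probabilities, then report the mean NLL.
N = torch.zeros((27 * 27, 27), dtype=torch.int32)
for (i1, i2), i3 in zip(xs_trigram.tolist(), ys_trigram.tolist()):
    N[i1 * 27 + i2, i3] += 1
P = (N + 1).float()
P /= P.sum(1, keepdim=True)
rows = xs_trigram[:, 0] * 27 + xs_trigram[:, 1]
print(-P[rows, ys_trigram].log().mean())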
inputs = torch.nn.functional.one_hot(xs_trigram).float()
W1 = torch.randn((27, 27), requires_grad=True)
W2 = torch.randn((27, 27), requires_grad=True)

for i in range(50):
    logits = inputs[:, 0, :] @ W1 + inputs[:, 1, :] @ W2
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    loss = -probs[torch.arange(ys_trigram.shape[0]), ys_trigram].log().mean()

    print(i, loss)

    # backward
    W1.grad = None
    W2.grad = None
    loss.backward()

    # updates
    W1.data += -20 * W1.grad
    W2.data += -20 * W2.grad
0 tensor(4.1870, grad_fn=<NegBackward0>)
1 tensor(3.7626, grad_fn=<NegBackward0>)
2 tensor(3.5288, grad_fn=<NegBackward0>)
3 tensor(3.3593, grad_fn=<NegBackward0>)
4 tensor(3.2326, grad_fn=<NegBackward0>)
5 tensor(3.1353, grad_fn=<NegBackward0>)
6 tensor(3.0582, grad_fn=<NegBackward0>)
7 tensor(2.9955, grad_fn=<NegBackward0>)
8 tensor(2.9438, grad_fn=<NegBackward0>)
9 tensor(2.9003, grad_fn=<NegBackward0>)
10 tensor(2.8631, grad_fn=<NegBackward0>)
11 tensor(2.8310, grad_fn=<NegBackward0>)
12 tensor(2.8029, grad_fn=<NegBackward0>)
13 tensor(2.7780, grad_fn=<NegBackward0>)
14 tensor(2.7558, grad_fn=<NegBackward0>)
15 tensor(2.7358, grad_fn=<NegBackward0>)
16 tensor(2.7176, grad_fn=<NegBackward0>)
17 tensor(2.7011, grad_fn=<NegBackward0>)
18 tensor(2.6859, grad_fn=<NegBackward0>)
19 tensor(2.6719, grad_fn=<NegBackward0>)
20 tensor(2.6590, grad_fn=<NegBackward0>)
21 tensor(2.6470, grad_fn=<NegBackward0>)
22 tensor(2.6358, grad_fn=<NegBackward0>)
23 tensor(2.6253, grad_fn=<NegBackward0>)
24 tensor(2.6156, grad_fn=<NegBackward0>)
25 tensor(2.6064, grad_fn=<NegBackward0>)
26 tensor(2.5977, grad_fn=<NegBackward0>)
27 tensor(2.5896, grad_fn=<NegBackward0>)
28 tensor(2.5819, grad_fn=<NegBackward0>)
29 tensor(2.5746, grad_fn=<NegBackward0>)
30 tensor(2.5677, grad_fn=<NegBackward0>)
31 tensor(2.5611, grad_fn=<NegBackward0>)
32 tensor(2.5549, grad_fn=<NegBackward0>)
33 tensor(2.5490, grad_fn=<NegBackward0>)
34 tensor(2.5433, grad_fn=<NegBackward0>)
35 tensor(2.5379, grad_fn=<NegBackward0>)
36 tensor(2.5328, grad_fn=<NegBackward0>)
37 tensor(2.5278, grad_fn=<NegBackward0>)
38 tensor(2.5231, grad_fn=<NegBackward0>)
39 tensor(2.5186, grad_fn=<NegBackward0>)
40 tensor(2.5143, grad_fn=<NegBackward0>)
41 tensor(2.5101, grad_fn=<NegBackward0>)
42 tensor(2.5061, grad_fn=<NegBackward0>)
43 tensor(2.5023, grad_fn=<NegBackward0>)
44 tensor(2.4986, grad_fn=<NegBackward0>)
45 tensor(2.4951, grad_fn=<NegBackward0>)
46 tensor(2.4917, grad_fn=<NegBackward0>)
47 tensor(2.4884, grad_fn=<NegBackward0>)
48 tensor(2.4852, grad_fn=<NegBackward0>)
49 tensor(2.4821, grad_fn=<NegBackward0>)
Rewritten as a class below; training behaves equivalently.
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.w1 = nn.Parameter(torch.randn((27, 27)))
        self.w2 = nn.Parameter(torch.randn((27, 27)))

    def forward(self, x, targets=None):
        inputs = torch.nn.functional.one_hot(x).float()
        logits = inputs[:, 0, :] @ self.w1 + inputs[:, 1, :] @ self.w2
        probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)

        loss = None
        if targets is not None:
            loss = -probs[torch.arange(targets.shape[0]), targets].log().mean()

        return probs, loss
m = Model()

for i in range(50):
    probs, loss = m.forward(xs_trigram, targets=ys_trigram)

    print(i, loss)

    m.zero_grad()
    loss.backward()

    # updates
    m.w1.data += -20 * m.w1.grad
    m.w2.data += -20 * m.w2.grad
0 tensor(4.3345, grad_fn=<NegBackward0>)
1 tensor(3.9514, grad_fn=<NegBackward0>)
2 tensor(3.6622, grad_fn=<NegBackward0>)
3 tensor(3.4633, grad_fn=<NegBackward0>)
4 tensor(3.3159, grad_fn=<NegBackward0>)
5 tensor(3.1994, grad_fn=<NegBackward0>)
6 tensor(3.1078, grad_fn=<NegBackward0>)
7 tensor(3.0346, grad_fn=<NegBackward0>)
8 tensor(2.9750, grad_fn=<NegBackward0>)
9 tensor(2.9253, grad_fn=<NegBackward0>)
10 tensor(2.8830, grad_fn=<NegBackward0>)
11 tensor(2.8464, grad_fn=<NegBackward0>)
12 tensor(2.8145, grad_fn=<NegBackward0>)
13 tensor(2.7862, grad_fn=<NegBackward0>)
14 tensor(2.7610, grad_fn=<NegBackward0>)
15 tensor(2.7385, grad_fn=<NegBackward0>)
16 tensor(2.7181, grad_fn=<NegBackward0>)
17 tensor(2.6997, grad_fn=<NegBackward0>)
18 tensor(2.6830, grad_fn=<NegBackward0>)
19 tensor(2.6677, grad_fn=<NegBackward0>)
20 tensor(2.6538, grad_fn=<NegBackward0>)
21 tensor(2.6410, grad_fn=<NegBackward0>)
22 tensor(2.6292, grad_fn=<NegBackward0>)
23 tensor(2.6183, grad_fn=<NegBackward0>)
24 tensor(2.6083, grad_fn=<NegBackward0>)
25 tensor(2.5989, grad_fn=<NegBackward0>)
26 tensor(2.5902, grad_fn=<NegBackward0>)
27 tensor(2.5821, grad_fn=<NegBackward0>)
28 tensor(2.5745, grad_fn=<NegBackward0>)
29 tensor(2.5674, grad_fn=<NegBackward0>)
30 tensor(2.5608, grad_fn=<NegBackward0>)
31 tensor(2.5545, grad_fn=<NegBackward0>)
32 tensor(2.5485, grad_fn=<NegBackward0>)
33 tensor(2.5429, grad_fn=<NegBackward0>)
34 tensor(2.5376, grad_fn=<NegBackward0>)
35 tensor(2.5326, grad_fn=<NegBackward0>)
36 tensor(2.5278, grad_fn=<NegBackward0>)
37 tensor(2.5232, grad_fn=<NegBackward0>)
38 tensor(2.5188, grad_fn=<NegBackward0>)
39 tensor(2.5147, grad_fn=<NegBackward0>)
40 tensor(2.5107, grad_fn=<NegBackward0>)
41 tensor(2.5069, grad_fn=<NegBackward0>)
42 tensor(2.5032, grad_fn=<NegBackward0>)
43 tensor(2.4997, grad_fn=<NegBackward0>)
44 tensor(2.4963, grad_fn=<NegBackward0>)
45 tensor(2.4931, grad_fn=<NegBackward0>)
46 tensor(2.4900, grad_fn=<NegBackward0>)
47 tensor(2.4870, grad_fn=<NegBackward0>)
48 tensor(2.4841, grad_fn=<NegBackward0>)
49 tensor(2.4813, grad_fn=<NegBackward0>)
E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?
def split_data(array):
    # sequential 80/10/10 split (note: no shuffling)
    frac = array.shape[0] * 0.1 // 1
    train = array[:int(frac * 8)]
    val = array[int(frac * 8): int(frac * 9)]
    test = array[int(frac * 9):]
    assert train.shape[0] + test.shape[0] + val.shape[0] == array.shape[0]
    return train, val, test

train_xs_bigram, val_xs_bigram, test_xs_bigram = split_data(xs_bigram)
train_ys_bigram, val_ys_bigram, test_ys_bigram = split_data(ys_bigram)
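Note that split_data slices the arrays in order rather than shuffling, while the exercise asks for a random split. A minimal sketch of a shuffled 80/10/10 split is below; the shared permutation keeps xs and ys aligned, and the seed is an arbitrary choice of mine. It is not applied to the runs that follow.

# Shuffled split sketch: one permutation reused for xs and ys so pairs stay aligned.
g = torch.Generator().manual_seed(42)   # arbitrary seed
perm = torch.randperm(xs_bigram.shape[0], generator=g)
n_train = int(0.8 * len(perm))
n_val = int(0.9 * len(perm))
train_idx, val_idx, test_idx = perm[:n_train], perm[n_train:n_val], perm[n_val:]
# e.g. train_xs_bigram, train_ys_bigram = xs_bigram[train_idx], ys_bigram[train_idx]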
# init and training for bigram model
W = torch.randn((27, 27), requires_grad=True)

for i in range(50):
    # forward on the training split
    inputs = torch.nn.functional.one_hot(train_xs_bigram).float()
    logits = inputs @ W
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    loss = -probs[torch.arange(train_ys_bigram.shape[0]), train_ys_bigram].log().mean()

    # evaluate on the validation split
    val_inputs = torch.nn.functional.one_hot(val_xs_bigram).float()
    logits = val_inputs @ W
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    val_loss = -probs[torch.arange(val_ys_bigram.shape[0]), val_ys_bigram].log().mean()

    # evaluate on the test split
    test_inputs = torch.nn.functional.one_hot(test_xs_bigram).float()
    logits = test_inputs @ W
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    test_loss = -probs[torch.arange(test_ys_bigram.shape[0]), test_ys_bigram].log().mean()

    print(i, f"{loss.data.item()=:.4f}, {val_loss.data.item()=:.4f}, {test_loss.data.item()=:.4f}")

    # backward
    W.grad = None
    loss.backward()

    # updates
    W.data += -50 * W.grad
0 loss.data.item()=3.7033, val_loss.data.item()=3.6566, test_loss.data.item()=3.6431
1 loss.data.item()=3.2771, val_loss.data.item()=3.3136, test_loss.data.item()=3.3068
2 loss.data.item()=3.0436, val_loss.data.item()=3.1405, test_loss.data.item()=3.1388
3 loss.data.item()=2.9152, val_loss.data.item()=3.0434, test_loss.data.item()=3.0454
4 loss.data.item()=2.8324, val_loss.data.item()=2.9776, test_loss.data.item()=2.9817
5 loss.data.item()=2.7750, val_loss.data.item()=2.9334, test_loss.data.item()=2.9378
6 loss.data.item()=2.7319, val_loss.data.item()=2.8967, test_loss.data.item()=2.9011
7 loss.data.item()=2.6978, val_loss.data.item()=2.8670, test_loss.data.item()=2.8714
8 loss.data.item()=2.6699, val_loss.data.item()=2.8417, test_loss.data.item()=2.8460
9 loss.data.item()=2.6468, val_loss.data.item()=2.8203, test_loss.data.item()=2.8246
10 loss.data.item()=2.6274, val_loss.data.item()=2.8018, test_loss.data.item()=2.8060
11 loss.data.item()=2.6109, val_loss.data.item()=2.7859, test_loss.data.item()=2.7901
12 loss.data.item()=2.5969, val_loss.data.item()=2.7721, test_loss.data.item()=2.7763
13 loss.data.item()=2.5848, val_loss.data.item()=2.7601, test_loss.data.item()=2.7642
14 loss.data.item()=2.5743, val_loss.data.item()=2.7496, test_loss.data.item()=2.7536
15 loss.data.item()=2.5651, val_loss.data.item()=2.7404, test_loss.data.item()=2.7443
16 loss.data.item()=2.5570, val_loss.data.item()=2.7321, test_loss.data.item()=2.7361
17 loss.data.item()=2.5498, val_loss.data.item()=2.7248, test_loss.data.item()=2.7287
18 loss.data.item()=2.5434, val_loss.data.item()=2.7182, test_loss.data.item()=2.7221
19 loss.data.item()=2.5376, val_loss.data.item()=2.7123, test_loss.data.item()=2.7161
20 loss.data.item()=2.5324, val_loss.data.item()=2.7069, test_loss.data.item()=2.7107
21 loss.data.item()=2.5277, val_loss.data.item()=2.7019, test_loss.data.item()=2.7058
22 loss.data.item()=2.5233, val_loss.data.item()=2.6974, test_loss.data.item()=2.7013
23 loss.data.item()=2.5193, val_loss.data.item()=2.6932, test_loss.data.item()=2.6971
24 loss.data.item()=2.5157, val_loss.data.item()=2.6894, test_loss.data.item()=2.6933
25 loss.data.item()=2.5123, val_loss.data.item()=2.6858, test_loss.data.item()=2.6897
26 loss.data.item()=2.5091, val_loss.data.item()=2.6825, test_loss.data.item()=2.6864
27 loss.data.item()=2.5062, val_loss.data.item()=2.6794, test_loss.data.item()=2.6833
28 loss.data.item()=2.5035, val_loss.data.item()=2.6764, test_loss.data.item()=2.6804
29 loss.data.item()=2.5009, val_loss.data.item()=2.6737, test_loss.data.item()=2.6777
30 loss.data.item()=2.4985, val_loss.data.item()=2.6711, test_loss.data.item()=2.6751
31 loss.data.item()=2.4962, val_loss.data.item()=2.6687, test_loss.data.item()=2.6727
32 loss.data.item()=2.4941, val_loss.data.item()=2.6664, test_loss.data.item()=2.6705
33 loss.data.item()=2.4921, val_loss.data.item()=2.6643, test_loss.data.item()=2.6683
34 loss.data.item()=2.4902, val_loss.data.item()=2.6622, test_loss.data.item()=2.6663
35 loss.data.item()=2.4883, val_loss.data.item()=2.6603, test_loss.data.item()=2.6643
36 loss.data.item()=2.4866, val_loss.data.item()=2.6584, test_loss.data.item()=2.6625
37 loss.data.item()=2.4850, val_loss.data.item()=2.6566, test_loss.data.item()=2.6607
38 loss.data.item()=2.4834, val_loss.data.item()=2.6550, test_loss.data.item()=2.6591
39 loss.data.item()=2.4819, val_loss.data.item()=2.6534, test_loss.data.item()=2.6575
40 loss.data.item()=2.4805, val_loss.data.item()=2.6518, test_loss.data.item()=2.6559
41 loss.data.item()=2.4792, val_loss.data.item()=2.6504, test_loss.data.item()=2.6545
42 loss.data.item()=2.4779, val_loss.data.item()=2.6490, test_loss.data.item()=2.6531
43 loss.data.item()=2.4766, val_loss.data.item()=2.6476, test_loss.data.item()=2.6518
44 loss.data.item()=2.4754, val_loss.data.item()=2.6463, test_loss.data.item()=2.6505
45 loss.data.item()=2.4743, val_loss.data.item()=2.6451, test_loss.data.item()=2.6492
46 loss.data.item()=2.4732, val_loss.data.item()=2.6439, test_loss.data.item()=2.6481
47 loss.data.item()=2.4721, val_loss.data.item()=2.6427, test_loss.data.item()=2.6469
48 loss.data.item()=2.4711, val_loss.data.item()=2.6416, test_loss.data.item()=2.6458
49 loss.data.item()=2.4701, val_loss.data.item()=2.6406, test_loss.data.item()=2.6448
E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?
train_xs_trigram, val_xs_trigram, test_xs_trigram = split_data(xs_trigram)
train_ys_trigram, val_ys_trigram, test_ys_trigram = split_data(ys_trigram)
def generate_name(model):
    trigram_input = ".."
    output = ""
    while True:
        model_input = torch.tensor([[ltoi[x] for x in trigram_input]])
        probs, _ = model(model_input)
        next_letter = itol[torch.multinomial(probs, 1).item()]
        if next_letter == ".":
            break
        output += next_letter
        trigram_input = output[-2:]

        if len(trigram_input) < 2:
            trigram_input = "." + trigram_input

    return output
class Model(nn.Module):
    def __init__(self, reg_weight=0):
        super().__init__()
        self.w1 = nn.Parameter(torch.randn((27, 27)))
        self.w2 = nn.Parameter(torch.randn((27, 27)))
        self.reg_weight = reg_weight

    def forward(self, x, targets=None):
        inputs = torch.nn.functional.one_hot(x, num_classes=27).float()
        logits = inputs[:, 0, :] @ self.w1 + inputs[:, 1, :] @ self.w2
        probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)

        loss = None
        if targets is not None:
            loss = -probs[torch.arange(targets.shape[0]), targets].log().mean()

            if self.reg_weight:
                # L2 penalty on both weight matrices
                loss += self.reg_weight * torch.cat((self.w1**2, self.w2**2)).sum()

        return probs, loss

    def update(self, lr):
        self.w1.data += -lr * self.w1.grad
        self.w2.data += -lr * self.w2.grad
m = Model(1e-3)
probs, loss = m(train_xs_trigram, targets=train_ys_trigram)
m.zero_grad()
print(loss)
tensor(4.5851, grad_fn=<AddBackward0>)
loss.backward()
m.update(20)
loss
tensor(3.5196, grad_fn=<NegBackward0>)
probs, loss = m(train_xs_trigram, targets=train_ys_trigram)
loss.data.item()
3.7461421489715576
for reg in [1e-3]:  # [0, 1e-10, 1e-5, 1e-3]
    m = Model(reg_weight=reg)
    for i in range(50):
        probs, loss = m(train_xs_trigram, targets=train_ys_trigram)
        _, val_loss = m(val_xs_trigram, targets=val_ys_trigram)
        _, test_loss = m(test_xs_trigram, targets=test_ys_trigram)

        # print(i, f"{loss.data.item()=:.4f}, {val_loss.data.item()=:.4f}")
        m.zero_grad()
        loss.backward()
        m.update(20)

    print(f"{reg=}", f"{loss.data.item()=:.4f}", f"{val_loss.data.item()=:.4f}", f"{test_loss.data.item()=:.4f}")
reg=0.001 loss.data.item()=2.6747 val_loss.data.item()=2.8234 test_loss.data.item()=2.8281
generate_name(m)
'daca'
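The sweep above only runs reg=1e-3 (the other candidates are commented out). Below is a sketch of the full protocol the exercise describes, assuming the Model class and the splits defined above: train one model per candidate strength, select the one with the lowest dev loss, and only then evaluate on the test set once. Note that forward() folds the L2 penalty into the returned loss, so these dev/test numbers include it, just as in the sweep above.

# Sweep regularization strengths, select on the dev split, evaluate on test once.
results = []
for reg in [0, 1e-10, 1e-5, 1e-3]:
    m = Model(reg_weight=reg)
    for i in range(50):
        _, loss = m(train_xs_trigram, targets=train_ys_trigram)
        m.zero_grad()
        loss.backward()
        m.update(20)
    _, val_loss = m(val_xs_trigram, targets=val_ys_trigram)
    results.append((val_loss.item(), reg, m))

best_val, best_reg, best_model = min(results, key=lambda r: r[0])
_, test_loss = best_model(test_xs_trigram, targets=test_ys_trigram)
print(f"{best_reg=}, {best_val=:.4f}, {test_loss.item()=:.4f}")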
E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?
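One way to do this, as a sketch rather than a worked solution: multiplying a one-hot row vector by W just selects the corresponding row of W, so the logits can be gathered by indexing the weight matrices directly, e.g. for the bigram and trigram setups above.

# Bigram: each one-hot row of the input picks out one row of W, so index directly.
logits = W[xs_bigram]                                 # shape (N, 27), no one_hot/matmul
# Trigram: add the rows selected by the first and second context characters.
logits = W1[xs_trigram[:, 0]] + W2[xs_trigram[:, 1]]  # shape (N, 27)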
E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we’d prefer to use F.cross_entropy instead?
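A sketch of that swap, reusing the indexed logits from E04: F.cross_entropy takes raw logits and integer targets and computes the same mean negative log-likelihood, but it applies log-softmax internally (via log-sum-exp), which is more numerically stable than exponentiating raw logits and avoids materializing the probs matrix.

# Same loss as the manual softmax + gather + log + mean, computed from logits directly.
logits = W[xs_bigram]                       # or torch.nn.functional.one_hot(xs_bigram).float() @ W
loss = F.cross_entropy(logits, ys_bigram)   # mean NLL over the batch
print(loss)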
E06: meta-exercise! Think of a fun/interesting exercise and complete it.