Exercises

import torch
import torch.nn as nn
import torch.nn.functional as F
with open("../names.txt", "r") as f:
    names = f.read().splitlines()
# build the 27-character vocabulary ('.' marks name boundaries) and the index lookups
letters = sorted(list(set("".join(names))))
letters.append(".")
ltoi = {l:i for i, l in enumerate(letters)}
itol = {i:l for l, i in ltoi.items()}
# bigrams
xs_bigram, ys_bigram = [], []
for n in names:
    n = "." + n + "."
    for first, second in zip(n, n[1:]):
        xs_bigram.append(ltoi[first])
        ys_bigram.append(ltoi[second])
        
xs_bigram = torch.tensor(xs_bigram)
ys_bigram = torch.tensor(ys_bigram)
W = torch.randn((27, 27), requires_grad=True)
# forward
inputs = torch.nn.functional.one_hot(xs_bigram).float()
logits = inputs @ W 
probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
loss = -probs[torch.arange(ys_bigram.shape[0]), ys_bigram].log().mean()
print(loss)
tensor(3.8877, grad_fn=<NegBackward0>)
# backward
W.grad = None
loss.backward()
# updates
W.data += -50 * W.grad
# init and training for bigram model
W = torch.randn((27, 27), requires_grad=True)

for i in range(50):
    # forward
    inputs = torch.nn.functional.one_hot(xs_bigram).float()
    logits = inputs @ W 
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    loss = -probs[torch.arange(ys_bigram.shape[0]), ys_bigram].log().mean()
    
    print(i, loss)
    
    # backward
    W.grad = None
    loss.backward()
    
    # updates
    W.data += -50 * W.grad
0 tensor(3.7675, grad_fn=<NegBackward0>)
1 tensor(3.3568, grad_fn=<NegBackward0>)
2 tensor(3.1371, grad_fn=<NegBackward0>)
3 tensor(3.0079, grad_fn=<NegBackward0>)
4 tensor(2.9158, grad_fn=<NegBackward0>)
5 tensor(2.8464, grad_fn=<NegBackward0>)
6 tensor(2.7934, grad_fn=<NegBackward0>)
7 tensor(2.7524, grad_fn=<NegBackward0>)
8 tensor(2.7200, grad_fn=<NegBackward0>)
9 tensor(2.6940, grad_fn=<NegBackward0>)
10 tensor(2.6726, grad_fn=<NegBackward0>)
11 tensor(2.6547, grad_fn=<NegBackward0>)
12 tensor(2.6395, grad_fn=<NegBackward0>)
13 tensor(2.6263, grad_fn=<NegBackward0>)
14 tensor(2.6148, grad_fn=<NegBackward0>)
15 tensor(2.6046, grad_fn=<NegBackward0>)
16 tensor(2.5956, grad_fn=<NegBackward0>)
17 tensor(2.5876, grad_fn=<NegBackward0>)
18 tensor(2.5805, grad_fn=<NegBackward0>)
19 tensor(2.5740, grad_fn=<NegBackward0>)
20 tensor(2.5682, grad_fn=<NegBackward0>)
21 tensor(2.5629, grad_fn=<NegBackward0>)
22 tensor(2.5581, grad_fn=<NegBackward0>)
23 tensor(2.5536, grad_fn=<NegBackward0>)
24 tensor(2.5495, grad_fn=<NegBackward0>)
25 tensor(2.5457, grad_fn=<NegBackward0>)
26 tensor(2.5421, grad_fn=<NegBackward0>)
27 tensor(2.5388, grad_fn=<NegBackward0>)
28 tensor(2.5357, grad_fn=<NegBackward0>)
29 tensor(2.5328, grad_fn=<NegBackward0>)
30 tensor(2.5301, grad_fn=<NegBackward0>)
31 tensor(2.5275, grad_fn=<NegBackward0>)
32 tensor(2.5251, grad_fn=<NegBackward0>)
33 tensor(2.5228, grad_fn=<NegBackward0>)
34 tensor(2.5206, grad_fn=<NegBackward0>)
35 tensor(2.5185, grad_fn=<NegBackward0>)
36 tensor(2.5166, grad_fn=<NegBackward0>)
37 tensor(2.5147, grad_fn=<NegBackward0>)
38 tensor(2.5129, grad_fn=<NegBackward0>)
39 tensor(2.5113, grad_fn=<NegBackward0>)
40 tensor(2.5097, grad_fn=<NegBackward0>)
41 tensor(2.5081, grad_fn=<NegBackward0>)
42 tensor(2.5067, grad_fn=<NegBackward0>)
43 tensor(2.5053, grad_fn=<NegBackward0>)
44 tensor(2.5040, grad_fn=<NegBackward0>)
45 tensor(2.5027, grad_fn=<NegBackward0>)
46 tensor(2.5015, grad_fn=<NegBackward0>)
47 tensor(2.5003, grad_fn=<NegBackward0>)
48 tensor(2.4992, grad_fn=<NegBackward0>)
49 tensor(2.4981, grad_fn=<NegBackward0>)
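
As a quick sanity check of the trained bigram weights, here is a minimal sampling sketch (assuming the W, ltoi and itol defined above), analogous to the generate_name helper used for the trigram model further down.

# sample one name from the trained bigram model, one character at a time
def generate_bigram_name(W):
    output = ""
    idx = ltoi["."]  # names start (and end) at the boundary token
    while True:
        x = torch.nn.functional.one_hot(torch.tensor([idx]), num_classes=27).float()
        logits = x @ W
        probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
        idx = torch.multinomial(probs, 1).item()
        if idx == ltoi["."]:
            break
        output += itol[idx]
    return output

generate_bigram_name(W)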

E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

# trigrams
xs_trigram, ys_trigram = [], []
for n in names:
    n = "." * 2 + n + "."
    for first, second, third in zip(n, n[1:], n[2:]):
        # print(first, second, third)
        xs_trigram.append([ltoi[first], ltoi[second]])
        ys_trigram.append(ltoi[third])
        
xs_trigram = torch.tensor(xs_trigram)
ys_trigram = torch.tensor(ys_trigram)
inputs = torch.nn.functional.one_hot(xs_trigram).float()
W1 = torch.randn((27, 27), requires_grad=True)
W2 = torch.randn((27, 27), requires_grad=True)

for i in range(50):
    logits = inputs[:, 0, :] @ W1 + inputs[:, 1, :] @ W2

    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    loss = -probs[torch.arange(ys_trigram.shape[0]), ys_trigram].log().mean()

    print(i, loss)
    
    # backward
    W1.grad = None
    W2.grad = None
    loss.backward()
    
    # updates
    W1.data += -20 * W1.grad
    W2.data += -20 * W2.grad
0 tensor(4.1870, grad_fn=<NegBackward0>)
1 tensor(3.7626, grad_fn=<NegBackward0>)
2 tensor(3.5288, grad_fn=<NegBackward0>)
3 tensor(3.3593, grad_fn=<NegBackward0>)
4 tensor(3.2326, grad_fn=<NegBackward0>)
5 tensor(3.1353, grad_fn=<NegBackward0>)
6 tensor(3.0582, grad_fn=<NegBackward0>)
7 tensor(2.9955, grad_fn=<NegBackward0>)
8 tensor(2.9438, grad_fn=<NegBackward0>)
9 tensor(2.9003, grad_fn=<NegBackward0>)
10 tensor(2.8631, grad_fn=<NegBackward0>)
11 tensor(2.8310, grad_fn=<NegBackward0>)
12 tensor(2.8029, grad_fn=<NegBackward0>)
13 tensor(2.7780, grad_fn=<NegBackward0>)
14 tensor(2.7558, grad_fn=<NegBackward0>)
15 tensor(2.7358, grad_fn=<NegBackward0>)
16 tensor(2.7176, grad_fn=<NegBackward0>)
17 tensor(2.7011, grad_fn=<NegBackward0>)
18 tensor(2.6859, grad_fn=<NegBackward0>)
19 tensor(2.6719, grad_fn=<NegBackward0>)
20 tensor(2.6590, grad_fn=<NegBackward0>)
21 tensor(2.6470, grad_fn=<NegBackward0>)
22 tensor(2.6358, grad_fn=<NegBackward0>)
23 tensor(2.6253, grad_fn=<NegBackward0>)
24 tensor(2.6156, grad_fn=<NegBackward0>)
25 tensor(2.6064, grad_fn=<NegBackward0>)
26 tensor(2.5977, grad_fn=<NegBackward0>)
27 tensor(2.5896, grad_fn=<NegBackward0>)
28 tensor(2.5819, grad_fn=<NegBackward0>)
29 tensor(2.5746, grad_fn=<NegBackward0>)
30 tensor(2.5677, grad_fn=<NegBackward0>)
31 tensor(2.5611, grad_fn=<NegBackward0>)
32 tensor(2.5549, grad_fn=<NegBackward0>)
33 tensor(2.5490, grad_fn=<NegBackward0>)
34 tensor(2.5433, grad_fn=<NegBackward0>)
35 tensor(2.5379, grad_fn=<NegBackward0>)
36 tensor(2.5328, grad_fn=<NegBackward0>)
37 tensor(2.5278, grad_fn=<NegBackward0>)
38 tensor(2.5231, grad_fn=<NegBackward0>)
39 tensor(2.5186, grad_fn=<NegBackward0>)
40 tensor(2.5143, grad_fn=<NegBackward0>)
41 tensor(2.5101, grad_fn=<NegBackward0>)
42 tensor(2.5061, grad_fn=<NegBackward0>)
43 tensor(2.5023, grad_fn=<NegBackward0>)
44 tensor(2.4986, grad_fn=<NegBackward0>)
45 tensor(2.4951, grad_fn=<NegBackward0>)
46 tensor(2.4917, grad_fn=<NegBackward0>)
47 tensor(2.4884, grad_fn=<NegBackward0>)
48 tensor(2.4852, grad_fn=<NegBackward0>)
49 tensor(2.4821, grad_fn=<NegBackward0>)

After 50 steps the trigram model reaches a loss of roughly 2.48 versus about 2.50 for the bigram model, so it improves slightly. Rewritten as a class below; training looks to be equivalent.

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.w1 = nn.Parameter(torch.randn((27, 27)))
        self.w2 = nn.Parameter(torch.randn((27, 27)))
        
    def forward(self, x, targets=None):
        inputs = torch.nn.functional.one_hot(x).float()
        logits = inputs[:, 0, :] @ self.w1 + inputs[:, 1, :] @ self.w2
        probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
        
        
        loss = None
        if targets is not None:
            loss = -probs[torch.arange(targets.shape[0]), targets].log().mean()

        return probs, loss
m = Model()
for i in range(50):
    probs, loss = m(xs_trigram, targets=ys_trigram)
    
    print(i, loss)
    m.zero_grad()
    loss.backward()
    # updates
    m.w1.data += -20 * m.w1.grad
    m.w2.data += -20 * m.w2.grad
    
    
0 tensor(4.3345, grad_fn=<NegBackward0>)
1 tensor(3.9514, grad_fn=<NegBackward0>)
2 tensor(3.6622, grad_fn=<NegBackward0>)
3 tensor(3.4633, grad_fn=<NegBackward0>)
4 tensor(3.3159, grad_fn=<NegBackward0>)
5 tensor(3.1994, grad_fn=<NegBackward0>)
6 tensor(3.1078, grad_fn=<NegBackward0>)
7 tensor(3.0346, grad_fn=<NegBackward0>)
8 tensor(2.9750, grad_fn=<NegBackward0>)
9 tensor(2.9253, grad_fn=<NegBackward0>)
10 tensor(2.8830, grad_fn=<NegBackward0>)
11 tensor(2.8464, grad_fn=<NegBackward0>)
12 tensor(2.8145, grad_fn=<NegBackward0>)
13 tensor(2.7862, grad_fn=<NegBackward0>)
14 tensor(2.7610, grad_fn=<NegBackward0>)
15 tensor(2.7385, grad_fn=<NegBackward0>)
16 tensor(2.7181, grad_fn=<NegBackward0>)
17 tensor(2.6997, grad_fn=<NegBackward0>)
18 tensor(2.6830, grad_fn=<NegBackward0>)
19 tensor(2.6677, grad_fn=<NegBackward0>)
20 tensor(2.6538, grad_fn=<NegBackward0>)
21 tensor(2.6410, grad_fn=<NegBackward0>)
22 tensor(2.6292, grad_fn=<NegBackward0>)
23 tensor(2.6183, grad_fn=<NegBackward0>)
24 tensor(2.6083, grad_fn=<NegBackward0>)
25 tensor(2.5989, grad_fn=<NegBackward0>)
26 tensor(2.5902, grad_fn=<NegBackward0>)
27 tensor(2.5821, grad_fn=<NegBackward0>)
28 tensor(2.5745, grad_fn=<NegBackward0>)
29 tensor(2.5674, grad_fn=<NegBackward0>)
30 tensor(2.5608, grad_fn=<NegBackward0>)
31 tensor(2.5545, grad_fn=<NegBackward0>)
32 tensor(2.5485, grad_fn=<NegBackward0>)
33 tensor(2.5429, grad_fn=<NegBackward0>)
34 tensor(2.5376, grad_fn=<NegBackward0>)
35 tensor(2.5326, grad_fn=<NegBackward0>)
36 tensor(2.5278, grad_fn=<NegBackward0>)
37 tensor(2.5232, grad_fn=<NegBackward0>)
38 tensor(2.5188, grad_fn=<NegBackward0>)
39 tensor(2.5147, grad_fn=<NegBackward0>)
40 tensor(2.5107, grad_fn=<NegBackward0>)
41 tensor(2.5069, grad_fn=<NegBackward0>)
42 tensor(2.5032, grad_fn=<NegBackward0>)
43 tensor(2.4997, grad_fn=<NegBackward0>)
44 tensor(2.4963, grad_fn=<NegBackward0>)
45 tensor(2.4931, grad_fn=<NegBackward0>)
46 tensor(2.4900, grad_fn=<NegBackward0>)
47 tensor(2.4870, grad_fn=<NegBackward0>)
48 tensor(2.4841, grad_fn=<NegBackward0>)
49 tensor(2.4813, grad_fn=<NegBackward0>)

E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

def split_data(array):
    # slice off 80% / 10% / 10% of the examples, keeping their original order
    frac = array.shape[0] * 0.1 // 1
    train = array[:int(frac * 8)]
    val = array[int(frac * 8): int(frac * 9)]
    test = array[int(frac * 9):]
    assert train.shape[0] + test.shape[0] + val.shape[0] == array.shape[0]
    return train, val, test
train_xs_bigram, val_xs_bigram, test_xs_bigram = split_data(xs_bigram)
train_ys_bigram, val_ys_bigram, test_ys_bigram = split_data(ys_bigram)
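
The split above slices the examples in their original order, whereas the exercise asks for a random split. A minimal sketch of a shuffled 80/10/10 split, using one shared permutation so inputs and targets stay aligned (the seed is an arbitrary choice; the runs below use the sequential split above):

# random 80/10/10 split with a single shared permutation
g = torch.Generator().manual_seed(42)  # arbitrary seed
perm = torch.randperm(xs_bigram.shape[0], generator=g)
n1, n2 = int(0.8 * len(perm)), int(0.9 * len(perm))
train_idx, val_idx, test_idx = perm[:n1], perm[n1:n2], perm[n2:]
# e.g. xs_bigram[train_idx], ys_bigram[train_idx] form the shuffled train split
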
# init and training for bigram model
W = torch.randn((27, 27), requires_grad=True)

for i in range(50):
    # forward on the train split (num_classes made explicit so a split doesn't have to contain every character)
    inputs = torch.nn.functional.one_hot(train_xs_bigram, num_classes=27).float()
    logits = inputs @ W
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    loss = -probs[torch.arange(train_ys_bigram.shape[0]), train_ys_bigram].log().mean()
    
    # evaluate on the dev split
    val_inputs = torch.nn.functional.one_hot(val_xs_bigram, num_classes=27).float()
    logits = val_inputs @ W
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    val_loss = -probs[torch.arange(val_ys_bigram.shape[0]), val_ys_bigram].log().mean()
    
    # evaluate on the test split
    test_inputs = torch.nn.functional.one_hot(test_xs_bigram, num_classes=27).float()
    logits = test_inputs @ W
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    test_loss = -probs[torch.arange(test_ys_bigram.shape[0]), test_ys_bigram].log().mean()
    
    print(i, f"{loss.data.item()=:.4f}, {val_loss.data.item()=:.4f}, {test_loss.data.item()=:.4f}")
    
    # backward
    W.grad = None
    loss.backward()
    
    # updates
    W.data += -50 * W.grad
0 loss.data.item()=3.7033, val_loss.data.item()=3.6566, test_loss.data.item()=3.6431
1 loss.data.item()=3.2771, val_loss.data.item()=3.3136, test_loss.data.item()=3.3068
2 loss.data.item()=3.0436, val_loss.data.item()=3.1405, test_loss.data.item()=3.1388
3 loss.data.item()=2.9152, val_loss.data.item()=3.0434, test_loss.data.item()=3.0454
4 loss.data.item()=2.8324, val_loss.data.item()=2.9776, test_loss.data.item()=2.9817
5 loss.data.item()=2.7750, val_loss.data.item()=2.9334, test_loss.data.item()=2.9378
6 loss.data.item()=2.7319, val_loss.data.item()=2.8967, test_loss.data.item()=2.9011
7 loss.data.item()=2.6978, val_loss.data.item()=2.8670, test_loss.data.item()=2.8714
8 loss.data.item()=2.6699, val_loss.data.item()=2.8417, test_loss.data.item()=2.8460
9 loss.data.item()=2.6468, val_loss.data.item()=2.8203, test_loss.data.item()=2.8246
10 loss.data.item()=2.6274, val_loss.data.item()=2.8018, test_loss.data.item()=2.8060
11 loss.data.item()=2.6109, val_loss.data.item()=2.7859, test_loss.data.item()=2.7901
12 loss.data.item()=2.5969, val_loss.data.item()=2.7721, test_loss.data.item()=2.7763
13 loss.data.item()=2.5848, val_loss.data.item()=2.7601, test_loss.data.item()=2.7642
14 loss.data.item()=2.5743, val_loss.data.item()=2.7496, test_loss.data.item()=2.7536
15 loss.data.item()=2.5651, val_loss.data.item()=2.7404, test_loss.data.item()=2.7443
16 loss.data.item()=2.5570, val_loss.data.item()=2.7321, test_loss.data.item()=2.7361
17 loss.data.item()=2.5498, val_loss.data.item()=2.7248, test_loss.data.item()=2.7287
18 loss.data.item()=2.5434, val_loss.data.item()=2.7182, test_loss.data.item()=2.7221
19 loss.data.item()=2.5376, val_loss.data.item()=2.7123, test_loss.data.item()=2.7161
20 loss.data.item()=2.5324, val_loss.data.item()=2.7069, test_loss.data.item()=2.7107
21 loss.data.item()=2.5277, val_loss.data.item()=2.7019, test_loss.data.item()=2.7058
22 loss.data.item()=2.5233, val_loss.data.item()=2.6974, test_loss.data.item()=2.7013
23 loss.data.item()=2.5193, val_loss.data.item()=2.6932, test_loss.data.item()=2.6971
24 loss.data.item()=2.5157, val_loss.data.item()=2.6894, test_loss.data.item()=2.6933
25 loss.data.item()=2.5123, val_loss.data.item()=2.6858, test_loss.data.item()=2.6897
26 loss.data.item()=2.5091, val_loss.data.item()=2.6825, test_loss.data.item()=2.6864
27 loss.data.item()=2.5062, val_loss.data.item()=2.6794, test_loss.data.item()=2.6833
28 loss.data.item()=2.5035, val_loss.data.item()=2.6764, test_loss.data.item()=2.6804
29 loss.data.item()=2.5009, val_loss.data.item()=2.6737, test_loss.data.item()=2.6777
30 loss.data.item()=2.4985, val_loss.data.item()=2.6711, test_loss.data.item()=2.6751
31 loss.data.item()=2.4962, val_loss.data.item()=2.6687, test_loss.data.item()=2.6727
32 loss.data.item()=2.4941, val_loss.data.item()=2.6664, test_loss.data.item()=2.6705
33 loss.data.item()=2.4921, val_loss.data.item()=2.6643, test_loss.data.item()=2.6683
34 loss.data.item()=2.4902, val_loss.data.item()=2.6622, test_loss.data.item()=2.6663
35 loss.data.item()=2.4883, val_loss.data.item()=2.6603, test_loss.data.item()=2.6643
36 loss.data.item()=2.4866, val_loss.data.item()=2.6584, test_loss.data.item()=2.6625
37 loss.data.item()=2.4850, val_loss.data.item()=2.6566, test_loss.data.item()=2.6607
38 loss.data.item()=2.4834, val_loss.data.item()=2.6550, test_loss.data.item()=2.6591
39 loss.data.item()=2.4819, val_loss.data.item()=2.6534, test_loss.data.item()=2.6575
40 loss.data.item()=2.4805, val_loss.data.item()=2.6518, test_loss.data.item()=2.6559
41 loss.data.item()=2.4792, val_loss.data.item()=2.6504, test_loss.data.item()=2.6545
42 loss.data.item()=2.4779, val_loss.data.item()=2.6490, test_loss.data.item()=2.6531
43 loss.data.item()=2.4766, val_loss.data.item()=2.6476, test_loss.data.item()=2.6518
44 loss.data.item()=2.4754, val_loss.data.item()=2.6463, test_loss.data.item()=2.6505
45 loss.data.item()=2.4743, val_loss.data.item()=2.6451, test_loss.data.item()=2.6492
46 loss.data.item()=2.4732, val_loss.data.item()=2.6439, test_loss.data.item()=2.6481
47 loss.data.item()=2.4721, val_loss.data.item()=2.6427, test_loss.data.item()=2.6469
48 loss.data.item()=2.4711, val_loss.data.item()=2.6416, test_loss.data.item()=2.6458
49 loss.data.item()=2.4701, val_loss.data.item()=2.6406, test_loss.data.item()=2.6448

After 50 steps the train loss sits around 2.47 while the dev and test losses are higher at about 2.64 and track each other closely, i.e. the model fits the training split somewhat better than the held-out data.

E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

train_xs_trigram, val_xs_trigram, test_xs_trigram = split_data(xs_trigram)
train_ys_trigram, val_ys_trigram, test_ys_trigram = split_data(ys_trigram)
def generate_name(model):
    trigram_input = ".."  # start from the boundary context
    output = ""
    while True:
        model_input = torch.tensor([[ltoi[x] for x in trigram_input]])
        probs, _ = model(model_input)
        # sample the next character from the model's distribution
        next_letter = itol[torch.multinomial(probs, 1).item()]
        if next_letter == ".":
            break
        output += next_letter
        trigram_input = output[-2:]
        
        # pad the context back to length 2 near the start of a name
        if len(trigram_input) < 2:
            trigram_input = "." + trigram_input
            
    return output
class Model(nn.Module):
    def __init__(self, reg_weight=0):
        super().__init__()
        self.w1 = nn.Parameter(torch.randn((27, 27)))
        self.w2 = nn.Parameter(torch.randn((27, 27)))
        self.reg_weight = reg_weight
        
    def forward(self, x, targets=None):
        inputs = torch.nn.functional.one_hot(x, num_classes=27).float()
        logits = inputs[:, 0, :] @ self.w1 + inputs[:, 1, :] @ self.w2
        probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
        
        
        loss = None
        if targets is not None:
            loss = -probs[torch.arange(targets.shape[0]), targets].log().mean()
            
            if self.reg_weight:
                loss += self.reg_weight * torch.cat((self.w1**2, self.w2**2)).sum()

        return probs, loss
    
    def update(self, lr):
        self.w1.data += -lr * self.w1.grad
        self.w2.data += -lr * self.w2.grad
# single manual training step to sanity-check the regularized model
m = Model(1e-3)
probs, loss = m(train_xs_trigram, targets=train_ys_trigram)
m.zero_grad()
print(loss)
tensor(4.5851, grad_fn=<AddBackward0>)
loss.backward()
m.update(20)
loss
tensor(3.5196, grad_fn=<NegBackward0>)
probs, loss = m(train_xs_trigram, targets=train_ys_trigram)
loss.data.item()
3.7461421489715576
for reg in [1e-3]: #[0, 1e-10, 1e-5, 1e-3]:
    m = Model(reg_weight=reg)
    for i in range(50):
        probs, loss = m(train_xs_trigram, targets=train_ys_trigram)
        _, val_loss = m(val_xs_trigram, targets=val_ys_trigram)
        _, test_loss = m(test_xs_trigram, targets=test_ys_trigram)
        
        # print(i, f"{loss.data.item()=:.4f}, {val_loss.data.item()=:.4f}")
        
        m.zero_grad()
        loss.backward()
        m.update(20)

    print(f"{reg=}", f"{loss.data.item()=:.4f}", f"{val_loss.data.item()=:.4f}", f"{test_loss.data.item()=:.4f}")
reg=0.001 loss.data.item()=2.6747 val_loss.data.item()=2.8234 test_loss.data.item()=2.8281
generate_name(m)
'daca'

E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?
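
A minimal sketch of the idea, assuming the W and xs_bigram from the bigram section: multiplying a one-hot row into W just selects the corresponding row of W, so plain indexing produces identical logits without materializing the one-hot matrix.

# one-hot matmul vs. direct row indexing: same logits
logits_onehot = torch.nn.functional.one_hot(xs_bigram, num_classes=27).float() @ W
logits_indexed = W[xs_bigram]  # row xs_bigram[i] of W for each example
print(torch.allclose(logits_onehot, logits_indexed))  # True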

E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we’d prefer to use F.cross_entropy instead?
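
A sketch of the swap, assuming the W, xs_bigram and ys_bigram from the bigram section. F.cross_entropy fuses the softmax and the negative log-likelihood, so the explicit probs tensor disappears; it is preferable because it works in log space (large logits don't overflow in exp) and avoids the intermediate tensors, making it both more numerically stable and more efficient.

# manual softmax + NLL vs. the fused loss
logits = torch.nn.functional.one_hot(xs_bigram, num_classes=27).float() @ W
probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
manual_loss = -probs[torch.arange(ys_bigram.shape[0]), ys_bigram].log().mean()
fused_loss = F.cross_entropy(logits, ys_bigram)  # same value (up to float error)
print(manual_loss.item(), fused_loss.item())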

E06: meta-exercise! Think of a fun/interesting exercise and complete it.