In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

Load the data...

In [2]:
with open('mnist.npy','rb') as f:
    pics=np.load(f)
    n,k,_=pics.shape
    k=k**2
    pics=np.reshape(pics,(n,k))
    targets=np.load(f)

At this point, pics is a 60000x784 data matrix, and targets is a length-60000 vector of integers, where an integer states what class the image is in.

We make the labels matrix, which is 60000x10, where each row is "one-hot," meaning only one element is 1, and the rest are 0. For example, suppose the 10th datapoint is in class 4. In this case, the 10th row of labels would be all zeros, except for a 1 in class 4. You can think of this as a probability - there's a 100% chance this is a 4, because an expert said so and labeled it as such, and a 0% chance it's anything else.

In [3]:
labels=np.zeros((n,10))
for i in range(n):
    labels[i,targets[i]]=1

Here we look at the sizes of pics and labels, as well as the first five elements of targets and labels, so we can see how they relate.

In [4]:
print(pics.shape,labels.shape)
print(targets[:5])
print(labels[:5,:])
(60000, 784) (60000, 10)
[5 0 4 1 9]
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]

Let's classify. Our output layer needs to have 10 outputs, to output the logits of all 10 classes.

In [5]:
class MnistClass(nn.Module):
    def __init__(self,width):
        super(MnistClass, self).__init__()
        self.hidden=nn.Linear(k,width)
        self.hidden_act=nn.ReLU()
        self.hidden2=nn.Linear(width,width)
        self.hidden2_act=nn.ReLU()
        self.output=nn.Linear(width,10)
    def forward(self,x):
        x=self.hidden(x)
        x=self.hidden_act(x)
        x=self.hidden2(x)
        x=self.hidden2_act(x)
        x=self.output(x)
        return x

To use our GPU, we need to move the data and model to the GPU. Note the two options below - the first results in much faster training.

Setup without .to('cuda')

In [6]:
X=torch.tensor(pics,dtype=torch.float32)
y=torch.tensor(labels,dtype=torch.float32)
In [7]:
model=MnistClass(50)

EPOCHS=100

criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=.01)

Setup with .to('cuda')

In [8]:
X=torch.tensor(pics,dtype=torch.float32).to('cuda')
y=torch.tensor(labels,dtype=torch.float32).to('cuda')
In [9]:
model=MnistClass(50).to('cuda')

EPOCHS=100

criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=.01)

training

In [10]:
for epoch in range(EPOCHS):
    predictions=model(X)
    loss=criterion(predictions,y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch%20==0:
        print(f'epoch: {epoch} loss: {loss.item():.4f}')
epoch: 0 loss: 16.3369
epoch: 20 loss: 1.3047
epoch: 40 loss: 0.7483
epoch: 60 loss: 0.5660
epoch: 80 loss: 0.4800

Here are the predictions from the first data point. Parenthetically, note the .cpu(), which is necessary to turn it into a numpy array (as numpy does not run on the GPU).

In [11]:
logits=model(X[0,:]).detach().cpu().numpy()
print(logits)
[11.413922   8.107069  10.413327  15.74374   -1.8922095 14.201479
  3.1337726  8.0927725 12.263035  11.015973 ]

Doing softmax by hand, resulting in actual probabilities... Turns out this is either a 3 or 5, most likely a 3, according to the model.

In [12]:
probs=np.exp(logits)/sum(np.exp(logits))
for i in range(10):
    print(f'{i}:',probs[i])
0: 0.010349646
1: 0.00037912052
2: 0.0038051568
3: 0.78585327
4: 1.722447e-08
5: 0.1680916
6: 2.6236273e-06
7: 0.00037373902
8: 0.024193035
9: 0.0069518173

See, the sum is about 1, which a sum of probabilities should be.

In [13]:
sum(probs)
Out[13]:
1.000000018043698

Here we call Softmax from pytorch, rather than doing it manually. Note the probabilities are the same as above.

In [14]:
probs=nn.Softmax(dim=0)(model(X[0,:]).detach())
print(probs)
tensor([1.0350e-02, 3.7912e-04, 3.8052e-03, 7.8585e-01, 1.7224e-08, 1.6809e-01,
        2.6236e-06, 3.7374e-04, 2.4193e-02, 6.9518e-03], device='cuda:0')

Here are all the predictions. Some are right, some are wrong.

In [15]:
logits=model(X).detach().cpu().numpy()
predLabels=np.argmax(logits,axis=1)
print(predLabels)
print(targets)
[3 0 4 ... 5 6 8]
[5 0 4 ... 5 6 8]

Confusion matrix! Which classes are confused for each other?

In [16]:
import sklearn
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(predLabels,targets)
print(cm)
[[5553    3   66   40    7   95  108   27   15   29]
 [   0 6439   47   29   12   36   13   47  123    6]
 [  52   39 5063  202   23   40  111   43  157    6]
 [  38   37  178 5101    8  239    1   98  343  111]
 [   8    6   79   14 5155   17   79  101   19  248]
 [  88   28   31  242    8 4301  102    6  254   30]
 [ 114    7  216   18  201  126 5437    4   93   44]
 [  24   26   56  190   32   61    5 5668   41  299]
 [  39  149  204  261   61  386   56   65 4666  132]
 [   7    8   18   34  335  120    6  206  140 5044]]
In [17]:
import plotly.graph_objects as go
# Create a Plotly heatmap
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=[f'Predicted {i}' for i in range(cm.shape[0])],
    y=[f'Actual {i}' for i in range(cm.shape[0])],
    colorscale='Viridis',  # You can choose any colorscale you like
    colorbar=dict(title='Count')
))

# Update layout
fig.update_layout(
    title='Confusion Matrix',
    xaxis_title='Predicted label',
    yaxis_title='Actual label',
)

# Show plot
fig.show()

With Train/test split¶

Here we demonstrate how to use train/test splits.

In [18]:
from sklearn.model_selection import train_test_split
Xtr,Xte,ytr,yte=train_test_split(X,y)
In [19]:
print(Xtr.shape,ytr.shape)
print(Xte.shape,yte.shape)
torch.Size([45000, 784]) torch.Size([45000, 10])
torch.Size([15000, 784]) torch.Size([15000, 10])
In [20]:
model=MnistClass(50).to('cuda')

EPOCHS=1000

criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=.001)

Xtr=Xtr.to('cuda')
Xte=Xte.to('cuda')
for epoch in range(EPOCHS):
    predictions=model(Xtr)
    loss=criterion(predictions,ytr)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch%50==0:
        print(f'epoch: {epoch} training loss: {loss.item():.4f}')
        with torch.no_grad():
            predictions=model(Xte)
            test_loss=criterion(predictions,yte)
            print(f'      testing loss: {test_loss.item():.4f}')
epoch: 0 training loss: 14.3887
      testing loss: 8.0118
epoch: 50 training loss: 0.3987
      testing loss: 0.4146
epoch: 100 training loss: 0.2478
      testing loss: 0.2872
epoch: 150 training loss: 0.1900
      testing loss: 0.2483
epoch: 200 training loss: 0.1551
      testing loss: 0.2295
epoch: 250 training loss: 0.1313
      testing loss: 0.2179
epoch: 300 training loss: 0.1129
      testing loss: 0.2109
epoch: 350 training loss: 0.0981
      testing loss: 0.2073
epoch: 400 training loss: 0.0859
      testing loss: 0.2064
epoch: 450 training loss: 0.0753
      testing loss: 0.2070
epoch: 500 training loss: 0.0663
      testing loss: 0.2104
epoch: 550 training loss: 0.0585
      testing loss: 0.2146
epoch: 600 training loss: 0.0516
      testing loss: 0.2203
epoch: 650 training loss: 0.0455
      testing loss: 0.2275
epoch: 700 training loss: 0.0403
      testing loss: 0.2354
epoch: 750 training loss: 0.0357
      testing loss: 0.2443
epoch: 800 training loss: 0.0316
      testing loss: 0.2532
epoch: 850 training loss: 0.0279
      testing loss: 0.2627
epoch: 900 training loss: 0.0246
      testing loss: 0.2728
epoch: 950 training loss: 0.0216
      testing loss: 0.2842

When your data gets big...¶

Here we have a demo that shows how to make a Da

Here we've made an example pytorch Dataset. It's made up of three things - a constructor, where you create whatever fields you need, __len__, which is necessary to output the number of elements in the dataset, and __getitem__, which defines how to get the idx-th element of the dataset.

This is a very simple example, which still requires pulling the full dataset into memory to store into self.X and self.y. Suppose our dataset was a whole lot of pictures, and this was unrealistic. Our constructor might store a file directory, __len__ might count the number of images in the directory, and __getitem__ might load and return the idx-th picture in the directory.

In [21]:
from torch.utils.data import Dataset
class MnistDataset(Dataset):
    def __init__(self,X,y):
        self.X=X
        self.y=y
    def __len__(self):
        n,_=self.X.shape
        return n
    def __getitem__(self,idx):
        return self.X[idx,:],self.y[idx,:]
trainingSet=MnistDataset(Xtr,ytr)
testingSet=MnistDataset(Xte,yte)
In [22]:
from torch.utils.data import DataLoader

trainLoader=DataLoader(trainingSet,batch_size=1000,shuffle=True)
testLoader=DataLoader(testingSet,batch_size=1000,shuffle=False)

This example shows tensorboard, which requires a conda/mamba install tensorboard to add it to the virtual environment.

run tensorboard --logdir=runs to get the URL to explore the output.

In [23]:
from torch.utils.tensorboard import SummaryWriter
writer=SummaryWriter()
In [ ]:
model=MnistClass(50).to('cuda')

EPOCHS=100

criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=.001)
for epoch in range(EPOCHS):
    totalloss=0
    for batch, (X,y) in enumerate(trainLoader):
        predictions=model(X.to('cuda'))
        loss=criterion(predictions,y)
    
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        totalloss+=loss

    totalloss/=len(trainLoader)
    writer.add_scalar('Loss/train',totalloss,epoch)
    test_loss=0
        
    with torch.no_grad():
        for X,y in testLoader:
            pred=model(X.to('cuda'))
            test_loss+=criterion(pred,y).item()
        test_loss/=len(testLoader)
    writer.add_scalar('Loss/test',test_loss,epoch)
In [ ]: