import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
Load the data...
with open('mnist.npy','rb') as f:
pics=np.load(f)
n,k,_=pics.shape
k=k**2
pics=np.reshape(pics,(n,k))
targets=np.load(f)
At this point, pics is a 60000x784 data matrix, and targets is a length-60000 vector of integers, where an integer states what class the image is in.
We make the labels matrix, which is 60000x10, where each row is "one-hot," meaning only one element is 1, and the rest are 0. For example, suppose the 10th datapoint is in class 4. In this case, the 10th row of labels would be all zeros, except for a 1 in class 4. You can think of this as a probability - there's a 100% chance this is a 4, because an expert said so and labeled it as such, and a 0% chance it's anything else.
labels=np.zeros((n,10))
for i in range(n):
labels[i,targets[i]]=1
Here we look at the sizes of pics and labels, as well as the first five elements of targets and labels, so we can see how they relate.
print(pics.shape,labels.shape)
print(targets[:5])
print(labels[:5,:])
(60000, 784) (60000, 10) [5 0 4 1 9] [[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.] [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
Let's classify. Our output layer needs to have 10 outputs, to output the logits of all 10 classes.
class MnistClass(nn.Module):
def __init__(self,width):
super(MnistClass, self).__init__()
self.hidden=nn.Linear(k,width)
self.hidden_act=nn.ReLU()
self.hidden2=nn.Linear(width,width)
self.hidden2_act=nn.ReLU()
self.output=nn.Linear(width,10)
def forward(self,x):
x=self.hidden(x)
x=self.hidden_act(x)
x=self.hidden2(x)
x=self.hidden2_act(x)
x=self.output(x)
return x
To use our GPU, we need to move the data and model to the GPU. Note the two options below - the first results in much faster training.
Setup without .to('cuda')
X=torch.tensor(pics,dtype=torch.float32)
y=torch.tensor(labels,dtype=torch.float32)
model=MnistClass(50)
EPOCHS=100
criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=.01)
Setup with .to('cuda')
X=torch.tensor(pics,dtype=torch.float32).to('cuda')
y=torch.tensor(labels,dtype=torch.float32).to('cuda')
model=MnistClass(50).to('cuda')
EPOCHS=100
criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=.01)
training
for epoch in range(EPOCHS):
predictions=model(X)
loss=criterion(predictions,y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch%20==0:
print(f'epoch: {epoch} loss: {loss.item():.4f}')
epoch: 0 loss: 16.3369 epoch: 20 loss: 1.3047 epoch: 40 loss: 0.7483 epoch: 60 loss: 0.5660 epoch: 80 loss: 0.4800
Here are the predictions from the first data point. Parenthetically, note the .cpu(), which is necessary to turn it into a numpy array (as numpy does not run on the GPU).
logits=model(X[0,:]).detach().cpu().numpy()
print(logits)
[11.413922 8.107069 10.413327 15.74374 -1.8922095 14.201479 3.1337726 8.0927725 12.263035 11.015973 ]
Doing softmax by hand, resulting in actual probabilities... Turns out this is either a 3 or 5, most likely a 3, according to the model.
probs=np.exp(logits)/sum(np.exp(logits))
for i in range(10):
print(f'{i}:',probs[i])
0: 0.010349646 1: 0.00037912052 2: 0.0038051568 3: 0.78585327 4: 1.722447e-08 5: 0.1680916 6: 2.6236273e-06 7: 0.00037373902 8: 0.024193035 9: 0.0069518173
See, the sum is about 1, which a sum of probabilities should be.
sum(probs)
1.000000018043698
Here we call Softmax from pytorch, rather than doing it manually. Note the probabilities are the same as above.
probs=nn.Softmax(dim=0)(model(X[0,:]).detach())
print(probs)
tensor([1.0350e-02, 3.7912e-04, 3.8052e-03, 7.8585e-01, 1.7224e-08, 1.6809e-01,
2.6236e-06, 3.7374e-04, 2.4193e-02, 6.9518e-03], device='cuda:0')
Here are all the predictions. Some are right, some are wrong.
logits=model(X).detach().cpu().numpy()
predLabels=np.argmax(logits,axis=1)
print(predLabels)
print(targets)
[3 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
Confusion matrix! Which classes are confused for each other?
import sklearn
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(predLabels,targets)
print(cm)
[[5553 3 66 40 7 95 108 27 15 29] [ 0 6439 47 29 12 36 13 47 123 6] [ 52 39 5063 202 23 40 111 43 157 6] [ 38 37 178 5101 8 239 1 98 343 111] [ 8 6 79 14 5155 17 79 101 19 248] [ 88 28 31 242 8 4301 102 6 254 30] [ 114 7 216 18 201 126 5437 4 93 44] [ 24 26 56 190 32 61 5 5668 41 299] [ 39 149 204 261 61 386 56 65 4666 132] [ 7 8 18 34 335 120 6 206 140 5044]]
import plotly.graph_objects as go
# Create a Plotly heatmap
fig = go.Figure(data=go.Heatmap(
z=cm,
x=[f'Predicted {i}' for i in range(cm.shape[0])],
y=[f'Actual {i}' for i in range(cm.shape[0])],
colorscale='Viridis', # You can choose any colorscale you like
colorbar=dict(title='Count')
))
# Update layout
fig.update_layout(
title='Confusion Matrix',
xaxis_title='Predicted label',
yaxis_title='Actual label',
)
# Show plot
fig.show()
With Train/test split¶
Here we demonstrate how to use train/test splits.
from sklearn.model_selection import train_test_split
Xtr,Xte,ytr,yte=train_test_split(X,y)
print(Xtr.shape,ytr.shape)
print(Xte.shape,yte.shape)
torch.Size([45000, 784]) torch.Size([45000, 10]) torch.Size([15000, 784]) torch.Size([15000, 10])
model=MnistClass(50).to('cuda')
EPOCHS=1000
criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=.001)
Xtr=Xtr.to('cuda')
Xte=Xte.to('cuda')
for epoch in range(EPOCHS):
predictions=model(Xtr)
loss=criterion(predictions,ytr)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch%50==0:
print(f'epoch: {epoch} training loss: {loss.item():.4f}')
with torch.no_grad():
predictions=model(Xte)
test_loss=criterion(predictions,yte)
print(f' testing loss: {test_loss.item():.4f}')
epoch: 0 training loss: 14.3887
testing loss: 8.0118
epoch: 50 training loss: 0.3987
testing loss: 0.4146
epoch: 100 training loss: 0.2478
testing loss: 0.2872
epoch: 150 training loss: 0.1900
testing loss: 0.2483
epoch: 200 training loss: 0.1551
testing loss: 0.2295
epoch: 250 training loss: 0.1313
testing loss: 0.2179
epoch: 300 training loss: 0.1129
testing loss: 0.2109
epoch: 350 training loss: 0.0981
testing loss: 0.2073
epoch: 400 training loss: 0.0859
testing loss: 0.2064
epoch: 450 training loss: 0.0753
testing loss: 0.2070
epoch: 500 training loss: 0.0663
testing loss: 0.2104
epoch: 550 training loss: 0.0585
testing loss: 0.2146
epoch: 600 training loss: 0.0516
testing loss: 0.2203
epoch: 650 training loss: 0.0455
testing loss: 0.2275
epoch: 700 training loss: 0.0403
testing loss: 0.2354
epoch: 750 training loss: 0.0357
testing loss: 0.2443
epoch: 800 training loss: 0.0316
testing loss: 0.2532
epoch: 850 training loss: 0.0279
testing loss: 0.2627
epoch: 900 training loss: 0.0246
testing loss: 0.2728
epoch: 950 training loss: 0.0216
testing loss: 0.2842
When your data gets big...¶
Here we have a demo that shows how to make a Da
Here we've made an example pytorch Dataset. It's made up of three things - a constructor, where you create whatever fields you need, __len__, which is necessary to output the number of elements in the dataset, and __getitem__, which defines how to get the idx-th element of the dataset.
This is a very simple example, which still requires pulling the full dataset into memory to store into self.X and self.y. Suppose our dataset was a whole lot of pictures, and this was unrealistic. Our constructor might store a file directory, __len__ might count the number of images in the directory, and __getitem__ might load and return the idx-th picture in the directory.
from torch.utils.data import Dataset
class MnistDataset(Dataset):
def __init__(self,X,y):
self.X=X
self.y=y
def __len__(self):
n,_=self.X.shape
return n
def __getitem__(self,idx):
return self.X[idx,:],self.y[idx,:]
trainingSet=MnistDataset(Xtr,ytr)
testingSet=MnistDataset(Xte,yte)
from torch.utils.data import DataLoader
trainLoader=DataLoader(trainingSet,batch_size=1000,shuffle=True)
testLoader=DataLoader(testingSet,batch_size=1000,shuffle=False)
This example shows tensorboard, which requires a conda/mamba install tensorboard to add it to the virtual environment.
run tensorboard --logdir=runs to get the URL to explore the output.
from torch.utils.tensorboard import SummaryWriter
writer=SummaryWriter()
model=MnistClass(50).to('cuda')
EPOCHS=100
criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=.001)
for epoch in range(EPOCHS):
totalloss=0
for batch, (X,y) in enumerate(trainLoader):
predictions=model(X.to('cuda'))
loss=criterion(predictions,y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
totalloss+=loss
totalloss/=len(trainLoader)
writer.add_scalar('Loss/train',totalloss,epoch)
test_loss=0
with torch.no_grad():
for X,y in testLoader:
pred=model(X.to('cuda'))
test_loss+=criterion(pred,y).item()
test_loss/=len(testLoader)
writer.add_scalar('Loss/test',test_loss,epoch)