Table of Contents

AI - Computer Vision - linear/non-linear neural networks

see also:

Introduction

A linear/non-linear NN computer vision model

initial code

import torch
from torch import nn #neural networks

import torchvision
from torchvision import datasets
from torchvision import transforms
from torchvision.transforms import ToTensor

import matplotlib.pyplot as plt

Load the input image dataset

# Set up the training and test datasets (FashionMNIST: 28x28 grayscale clothing images).
train_data = datasets.FashionMNIST(
    root="data",            # destination folder to download to
    train=True,             # just the training split
    download=True,
    transform=ToTensor(),   # convert PIL image to a CHW float tensor; 0..255 -> 0..1
    target_transform=None,  # no label transform (BUG FIX: was NONE — Python's null is None)
)

test_data = datasets.FashionMNIST(
    root="data",            # destination folder to download to
    train=False,            # just the test split
    download=True,
    transform=ToTensor(),   # same NCHW tensor conversion as the training data
    target_transform=None,  # BUG FIX: was NONE
)

visualise the data

 #check how much data was imported:
len(train_data), len(test_data) #for this dataset there should be 60,000 and 10,000 respectively

#see what our Y label class names are: (eg. dress, trousers)
class_names = train_data.classes
print(class_names)

# see what Y index value they correspond to:
class_to_idx = train.data.class_to_idx
print(class_to_idx)

#check the shapes of the tensors:
print(f"Image shape: {image.shape} -> [color_channels, height, width]")
print(f"Image label: {class_names[label]}")

# Visualise a single sample.
# BUG FIX: removed `import matpltlib as plt` — misspelled, and pyplot is already
# imported as plt at the top of the file.
image, label = train_data[0]
print(f"Image shape: {image.shape}")
# imshow() expects (H, W) or (H, W, C), so squeeze out the leading 1-channel dim.
plt.imshow(image.squeeze(), cmap="gray")  # cmap="gray" since the color channel was dropped
plt.title(class_names[label])
plt.axis("off")  # BUG FIX: `plt.axis=False` rebinds the name; call plt.axis("off") instead

# View 16 random training images in a 4x4 grid.
fig = plt.figure(figsize=(9, 9))
rows, cols = 4, 4
for i in range(1, rows * cols + 1):  # BUG FIX: missing colon after the for statement
    random_idx = torch.randint(0, len(train_data), size=[1]).item()
    img, label = train_data[random_idx]
    fig.add_subplot(rows, cols, i)
    plt.imshow(img.squeeze(), cmap="gray")  # BUG FIX: plotted `image` instead of `img`
    plt.title(class_names[label])
    plt.axis("off")  # BUG FIX: `plt.axis=False` assigns over the function

Transform the image datasets into batches of Python iterables using DataLoader

from torch.utils.data import DataLoader

# Wrap the datasets in DataLoaders so we iterate mini-batches of (X, y) pairs.
BATCH_SIZE = 32  # number of images processed per step

train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,   # reshuffle each epoch in case the source data is ordered by class
)

test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=BATCH_SIZE,
    shuffle=False,  # evaluation order doesn't matter; keep it deterministic
)
                    

Create a baseline model

# Flatten collapses [C, H, W] into a single C*H*W feature vector, because a plain
# nn.Linear layer cannot consume the extra spatial dimensions.
flatten_model = nn.Flatten()

# Baseline model: flatten the image, then two linear layers (no non-linearity yet).

from torch import nn


class FashionMNISTModel(nn.Module):
    """Baseline linear model for FashionMNIST classification.

    Args:
        input_shape: number of input features after flattening (H * W, e.g. 28*28).
        hidden_units: width of the hidden linear layer (e.g. 10).
        output_shape: number of output classes, i.e. len(class_names).
    """

    def __init__(self,
                 input_shape: int,
                 hidden_units: int,
                 output_shape: int):
        super().__init__()  # BUG FIX: was `super() __init__()` (syntax error)
        # BUG FIX: attribute was written `self.layer.stack` but read as `self.layer_stack`
        self.layer_stack = nn.Sequential(
            nn.Flatten(),  # BUG FIX: the class is nn.Flatten, not nn.flatten
            nn.Linear(in_features=input_shape,     # must equal the flattened size H*W
                      out_features=hidden_units),
            nn.Linear(in_features=hidden_units,
                      out_features=output_shape),  # one logit per class
        )

    def forward(self, x):
        """Return raw logits of shape [batch, output_shape]."""
        return self.layer_stack(x)

# Instantiate the baseline model.
model_0 = FashionMNISTModel(
    input_shape=28 * 28,  # 784 features after flattening a 28x28 image
    hidden_units=10,
    output_shape=10,      # 10 FashionMNIST classes
)

Create a timer

# Timing helper: report how long a training run took.
from timeit import default_timer as timer


def print_train_time(start: float,
                     end: float,
                     device: torch.device = None):
    """Print and return the elapsed time between *start* and *end* in seconds."""
    total_time = end - start
    print(f"Train time on {device}: {total_time:.3f} seconds")
    return total_time
    
# Usage example:
start_time = timer()  # record just before the code being timed
# ... code to be timed goes here ...
end_time = timer()    # record just after
print_train_time(start=start_time, end=end_time, device="cpu")

Train the model

# Loss function and optimizer (loss is calculated per batch, not per epoch).
loss_fn = nn.CrossEntropyLoss()
# BUG FIX: the original call `torch.optim.SGD((params=...` had a stray opening
# parenthesis and no matching close, which is a syntax error.
optimizer = torch.optim.SGD(params=model_0.parameters(),  # parameters of the target model
                            lr=0.01)  # updates once per batch rather than per epoch
                            
# BUG FIX: the original line (`accuracy = torchmetrics.accuracy() ?`) was not valid
# Python and torchmetrics is never imported. Define the helper the evaluation loop
# actually calls (accuracy_fn) using plain torch instead.
def accuracy_fn(y_true, y_pred):
    """Return classification accuracy as a percentage (0..100).

    Args:
        y_true: 1-D tensor of true class indices.
        y_pred: 1-D tensor of predicted class indices (same length as y_true).
    """
    correct = torch.eq(y_true, y_pred).sum().item()
    return (correct / len(y_pred)) * 100

from tqdm.auto import tqdm  # progress bar

train_start_time = timer()

epochs = 3

# Training loop: one forward/backward/step per batch, loss averaged per epoch.
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n------")  # BUG FIX: printed `epochs` (the total) not `epoch`
    train_loss = 0
    for batch, (X, y) in enumerate(train_dataloader):
        model_0.train()
        y_pred = model_0(X)        # forward pass -> logits
        loss = loss_fn(y_pred, y)  # loss for this batch
        train_loss += loss         # accumulate for the per-epoch average
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Progress printout every 400 batches.
        # BUG FIX: `%%` is not an operator, the `if` lacked a colon, and the
        # f-string had an unbalanced parenthesis around train_dataloader.dataset.
        if batch % 400 == 0:
            print(f"Looked at {batch * len(X)}/{len(train_dataloader.dataset)} samples")

    # Average loss per batch for this epoch (inside the epoch loop).
    train_loss /= len(train_dataloader)

# Evaluate on the test set (no gradient tracking needed).
test_loss, test_acc = 0, 0
model_0.eval()
with torch.inference_mode():
    for X_test, y_test in test_dataloader:
        test_pred = model_0(X_test)
        test_loss += loss_fn(test_pred, y_test)
        # argmax converts logits -> predicted class index for each sample
        test_acc += accuracy_fn(y_true=y_test, y_pred=test_pred.argmax(dim=1))
    # BUG FIX: "average loss per batch" must divide by the number of batches
    # (len(test_dataloader)), not the number of samples (len(test_data)).
    test_loss /= len(test_dataloader)
    test_acc /= len(test_dataloader)

# Print what's happening:
print(f"\nTrain loss: {train_loss:.4f} | Test loss: {test_loss:.4f}  | Test acc: {test_acc:.4f} ")

train_end_time = timer()

# Report total training time on whichever device the model's parameters live on.
model_device = str(next(model_0.parameters()).device)
total_train_time = print_train_time(start=train_start_time,
                                    end=train_end_time,
                                    device=model_device)