from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks
galopy
August 31, 2023
We learned about callbacks last time. In this blog, we will learn about a learner. It’s common to use a fit function to train a model. However, as we want to modify behaviors and add more functionalities, the fit function needs to be changed constantly. Soon, the function gets very complicated.
We can simplify the code with a learner by adding functionalities with callbacks. This way, we can try different strategies quickly without changing the learner. This blog is based on lesson 16 of the FastAI course.
Let’s first start by grabbing our tools and a dataset from Hugging Face. We will use the Fashion MNIST dataset.
import logging
import matplotlib as mpl
import torch
import torch.nn.functional as F
import torchvision.transforms.functional as TF
import fastcore.all as fc
from contextlib import contextmanager
from torch import nn,tensor,optim
from datasets import load_dataset,load_dataset_builder,Array2D
from fastcore.test import test_close
from miniai.datasets import *
from miniai.conv import *
Okay, we imported all the libraries we need. Now we grab the data and shape it like we did before.
x,y = 'image','label'
name = "fashion_mnist"

# cast the images to flat float arrays in the dataset features
builder = load_dataset_builder(name)
dsd_features = builder.info.features.copy()
dsd_features['image'] = Array2D(shape=[1, 28*28], dtype='float32')
dsd = load_dataset(name, features=dsd_features)
dsd.set_format(type="torch")

# scale pixel values to [0, 1] and drop the extra dimension, in place
@inplace
def sq(b): b[x] = [o.squeeze().div(255) for o in b[x]]

tds = dsd.map(sq, batched=True)

bs = 1024
dls = DataLoaders.from_dd(tds, bs)
Let’s start with a basic learner. It grabs all the pieces it needs to train a model and splits the fitting process into one_epoch and one_batch. In this basic version, there are no callbacks yet.
class Learner:
    def __init__(self, model, dls, loss_func, lr=0.2, opt_func=optim.SGD): fc.store_attr()

    def fit(self, n_epochs):
        self.opt = self.opt_func(self.model.parameters(), self.lr)
        for self.epoch in range(n_epochs):
            self.one_epoch(True)                    # training pass
            torch.no_grad()(self.one_epoch)(False)  # validation pass, with gradients disabled

    def one_epoch(self, train):
        self.losses, self.accs, self.ns = [], [], []
        self.model.training = train
        self.dl = self.dls.train if train else self.dls.valid
        for self.n, self.batch in enumerate(self.dl): self.one_batch()
        # report the weighted averages over the whole epoch
        n = sum(self.ns)
        fit_acc = sum(self.accs) / n
        fit_loss = sum(self.losses) / n
        print(f'epoch: {self.epoch}, accuracy: {fit_acc}, loss: {fit_loss}, training: {train}')

    def one_batch(self):
        xb, yb = self.batch
        n = len(xb)
        self.preds = self.model(xb)
        self.loss = self.loss_func(self.preds, yb)
        acc = (yb == self.preds.argmax(dim=1)).float().sum()
        # keep per-batch stats so we can average them at the end of the epoch
        self.losses.append(self.loss * n)
        self.accs.append(acc)
        self.ns.append(n)
        if self.model.training:
            self.loss.backward()
            self.opt.step()
            self.opt.zero_grad()
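The cell that produced the output below isn’t in this export. It presumably looked something like this sketch; get_model() is used later in this post, and the learning rate and epoch count are assumed from that later usage.

# hypothetical reconstruction of the missing training cell
model = get_model()
learn = Learner(model, dls, F.cross_entropy, lr=0.2)
learn.fit(2)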
epoch: 0, accuracy: 0.6245333552360535, loss: 1.161386489868164, training: True
epoch: 0, accuracy: 0.6980000138282776, loss: 0.8195666074752808, training: False
epoch: 1, accuracy: 0.7442666888237, loss: 0.704105019569397, training: True
epoch: 1, accuracy: 0.7699000239372253, loss: 0.6478453278541565, training: False
Yay! We have a learner! A learner basically has this shape: fit, one_epoch, and one_batch. However, when we want to add a feature, we have to change the learner itself. Let’s make it more flexible by using callbacks. With callbacks, we can simply add or remove them to modify the learner’s behavior.
Before we jump into a learner with callbacks, let’s look at exceptions. What is an exception? It is something we can raise to interrupt a program.
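The demo cell isn’t included in this export; a sketch of what it likely did is below (the exact exception message is an assumption): print something, raise an exception, and then try to print again.

print('hello')
raise Exception('bye')  # execution stops here with a traceback
print('bye')            # this line never runs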
hello
Exception: bye
As we can see, the program printed ‘hello’ but did not print ‘bye’: it raised an exception and exited. However, having a traceback is not very pretty. We can also exit without all that by using a try/except block.
With this, we can modify how we train the model: we can exit early, or skip certain batches or epochs. Here are some custom exceptions we will use.
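The defining cell isn’t shown in this export; in the miniai style they are just empty Exception subclasses, roughly like this:

class CancelFitException(Exception): pass    # stop the whole fit
class CancelBatchException(Exception): pass  # skip the rest of the current batch
class CancelEpochException(Exception): pass  # skip the rest of the current epoch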
Here is a simple callback class.
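Its definition isn’t in the export; in the course it is just a class carrying an order attribute used for sorting, something like:

class Callback():
    order = 0  # callbacks with a lower order run first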
To call the callbacks, we sort them by their order first, then look for a method matching the name provided on each one. If the method exists, we call it.
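The run_cb cell isn’t shown either; based on how it is used below and inside the learner, a sketch might look like this:

from operator import attrgetter

def run_cb(cbs, method_name, learn=None):
    # run the callbacks in order; each callback may or may not define the method
    for cb in sorted(cbs, key=attrgetter('order')):
        method = getattr(cb, method_name, None)
        if method is not None: method(learn)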
Here is a simple callback that counts how many batches it has seen.
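Its definition isn’t in the export; a sketch consistent with the output below would be:

class CountBatchCB(Callback):
    # count how many times after_batch fires, and report the total after the fit
    def before_fit(self, learn): self.count = 0
    def after_batch(self, learn): self.count += 1
    def after_fit(self, learn): print(f'Total batch count: {self.count}')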
cb = CountBatchCB()
run_cb([cb], 'before_fit')
run_cb([cb], 'after_batch')
run_cb([cb], 'after_batch')
run_cb([cb], 'after_fit')
Total batch count: 2
Now let’s go over the learner with callbacks. This learner calls six callbacks: before_fit, after_fit, before_epoch, after_epoch, before_batch, and after_batch. With callbacks, we can modify how it trains. We can even stop training with an exception.
class Learner:
    def __init__(self, model, dls, loss_func, cbs=[], lr=0.2, opt_func=optim.SGD): fc.store_attr()

    def callback(self, name): run_cb(self.cbs, name, self)

    def fit(self, n_epochs):
        self.opt = self.opt_func(self.model.parameters(), self.lr)
        self.epochs = range(n_epochs)
        try:
            self.callback('before_fit')
            for self.epoch in self.epochs:
                self.one_epoch(True)
                torch.no_grad()(self.one_epoch)(False)
            self.callback('after_fit')
        except CancelFitException: pass    # a callback can stop the whole fit

    def one_epoch(self, train):
        self.model.training = train
        self.dl = self.dls.train if train else self.dls.valid
        try:
            self.callback('before_epoch')
            for self.n, self.batch in enumerate(self.dl): self.one_batch()
            self.callback('after_epoch')
        except CancelEpochException: pass  # a callback can skip the rest of the epoch

    def one_batch(self):
        try:
            self.callback('before_batch')
            self.xb, self.yb = self.batch
            self.preds = self.model(self.xb)
            self.loss = self.loss_func(self.preds, self.yb)
            if self.model.training:
                self.loss.backward()
                self.opt.step()
                self.opt.zero_grad()
            self.callback('after_batch')
        except CancelBatchException: pass  # a callback can skip the rest of the batch
We can create an NBatchCB callback to train for only N batches. It exits training by raising CancelFitException.
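The cell for it isn’t included in this export; a sketch of such a callback might be:

class NBatchCB(Callback):
    # stop the whole fit after n batches by raising CancelFitException
    def __init__(self, n=2): self.n = n
    def before_fit(self, learn): self.count = 0
    def after_batch(self, learn):
        self.count += 1
        if self.count >= self.n: raise CancelFitException()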
Our learner is so much more flexible than before. As we write more callbacks, we can feel how powerful this design is. At first, it may feel overwhelming because there are so many things to try and it is not obvious which ones are helpful. As we explore more callbacks, we will get a feel for how it really works.
We do not have a metric for this learner yet, so we don’t know how well it is doing. Fortunately, we can add any metric as a callback. Let’s create a metric callback.
Before creating a metric callback, let’s create a Metric class. This class serves as a base class for Accuracy. It can reset its internal state, calculate the metric for a batch, update the internal state with that result, and expose the overall result through a value property.
class Metric:
    def __init__(self): self.reset()

    def reset(self):
        self.ns, self.accs = [], []

    def update(self, preds, targs):
        # calculate the metric for this batch and remember it, weighted by batch size
        res = self.calculate(preds, targs)
        n = len(preds)
        self.ns.append(n)
        self.accs.append(res * n)

    def calculate(self, preds, targs): return 0  # subclasses override this

    @property
    def value(self):
        return tensor(self.accs).sum() / tensor(self.ns).sum()
We can create other metrics by inheriting from Metric and overriding calculate.
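For example, an Accuracy metric only needs to override calculate. The original cell isn’t shown; a sketch matching the Metric interface above would be:

class Accuracy(Metric):
    # fraction of predictions whose argmax matches the target
    def calculate(self, preds, targs): return (preds.argmax(dim=1) == targs).float().mean()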
Torcheval has MulticlassAccuracy, which does the same thing as what we just defined. Let’s use that. Instead of value, we have to call compute to get the result.
Collecting torcheval
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torcheval) (4.7.1)
Installing collected packages: torcheval
Successfully installed torcheval-0.0.7
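The usage cell isn’t in this export; a hypothetical call that would produce the tensor(1.) below is:

from torcheval.metrics import MulticlassAccuracy

acc = MulticlassAccuracy()
# hypothetical inputs: the predictions match the targets exactly, so accuracy is 1.0
acc.update(tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3]))
acc.compute()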
tensor(1.)
Now we can create a metrics callback. This will track all the metrics plus the loss. Because callbacks receive the learner, they have access to everything it holds, which gives us a lot of power and flexibility; for instance, we can grab the loss directly from the learner.
from copy import copy
from torcheval.metrics import Mean

class MetricsCB(Callback):
    def __init__(self, *ms, device=def_device, **metrics):
        # metrics can be passed positionally or as keyword arguments
        for m in ms:
            metrics[type(m).__name__] = m
        self.metrics = metrics
        self.all_metrics = copy(metrics)
        self.all_metrics['loss'] = self.loss = Mean()  # track the loss as a metric too
        self.loss.to(device)

    def _log(self, log):
        print(log)

    def before_epoch(self, learn):
        for m in self.all_metrics.values(): m.reset()

    def after_batch(self, learn):
        for m in self.metrics.values():
            m.update(learn.preds, learn.yb)
        self.loss.update(learn.loss)

    def after_epoch(self, learn):
        log = {k: f'{v.compute().item():.3f}' for k, v in self.all_metrics.items()}
        log['epoch'] = learn.epoch
        log['train'] = learn.model.training
        self._log(log)
model = get_model()
metrics = MetricsCB(accuracy=MulticlassAccuracy())
learn = Learner(model, dls, F.cross_entropy, lr=0.2, cbs=[metrics])
learn.fit(2)
{'accuracy': '0.618', 'loss': '1.150', 'epoch': 0, 'train': True}
{'accuracy': '0.707', 'loss': '0.790', 'epoch': 0, 'train': False}
{'accuracy': '0.748', 'loss': '0.699', 'epoch': 1, 'train': True}
{'accuracy': '0.757', 'loss': '0.654', 'epoch': 1, 'train': False}
We can also use a callback to move the model and data batches onto any device, such as a CUDA GPU, Apple GPU, or CPU. By default, it uses def_device, which is the GPU if the machine has one available.
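The DeviceCB definition isn’t shown in this export; a sketch along the lines of the miniai version, assuming to_device and def_device from miniai.conv, would be:

class DeviceCB(Callback):
    # move the model before the fit, and each batch before it is used
    def __init__(self, device=def_device): self.device = device
    def before_fit(self, learn): learn.model.to(self.device)
    def before_batch(self, learn): learn.batch = to_device(learn.batch, device=self.device)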
model = get_model()
metrics = MetricsCB(accuracy=MulticlassAccuracy())
learn = Learner(model, dls, F.cross_entropy, lr=0.2, cbs=[metrics, DeviceCB()])
learn.fit(2)
{'accuracy': '0.623', 'loss': '1.155', 'epoch': 0, 'train': True}
{'accuracy': '0.729', 'loss': '0.778', 'epoch': 0, 'train': False}
{'accuracy': '0.748', 'loss': '0.707', 'epoch': 1, 'train': True}
{'accuracy': '0.739', 'loss': '0.711', 'epoch': 1, 'train': False}
In this blog, we looked at two versions of a learner and how callbacks increase its flexibility. With callbacks, we can use different metrics without changing the learner. We can also automatically use the GPU if one is available, and we can exit a batch, an epoch, or the entire fit by raising an exception. In part two, we will look at more callbacks and a different version of the learner, which is more powerful and concise.