#hide
# !pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
#hide
from fastai.vision.all import *
from fastbook import *

matplotlib.rc('image', cmap='Greys')
import inspect
def print_source(obj):
    "Print the source code of `obj` as returned by `inspect.getsource`."
    # A single `print` emits the text followed by one newline, which is the
    # same output as printing every "\n"-separated line on its own.
    print(inspect.getsource(obj))

Convolutional Neural Networks

The Magic of Convolutions

  • feature engineering
    • creating new transformations of the input data in order to make it easier to model
    • one of the most powerful tools machine learning practitioners have at their disposal
  • a feature is a transformation of the data that is designed to make it easier to model

Convolution

  • applies a kernel across an image
    • multiplies each element of an $N \times N$ kernel by the corresponding element of an $N \times N$ block of the image and adds the results together
  • kernel: a little matrix

A guide to convolution arithmetic for deep learning

  • provides many great diagrams showing how image kernels can be applied
# A convolutional kernel that finds top edges (i.e. dark on bottom, light on top)
top_edge = tensor([[-1,-1,-1],
                   [ 0, 0, 0],
                   [ 1, 1, 1]]).float()
path = untar_data(URLs.MNIST_SAMPLE)
path
Path('/home/innom-dt/.fastai/data/mnist_sample')
im3 = Image.open(path/'train'/'3'/'12.png')
show_image(im3);

png

show_image
<function fastai.torch_core.show_image(im, ax=None, figsize=None, title=None, ctx=None, cmap=None, norm=None, *, aspect=None, interpolation=None, alpha=None, vmin=None, vmax=None, origin=None, extent=None, interpolation_stage=None, filternorm=True, filterrad=4.0, resample=None, url=None, data=None, **kwargs)>
print_source(show_image)
@delegates(plt.Axes.imshow, keep=True, but=['shape', 'imlim'])
def show_image(im, ax=None, figsize=None, title=None, ctx=None, **kwargs):
    "Show a PIL or PyTorch image on `ax`."
    # Handle pytorch axis order
    if hasattrs(im, ('data','cpu','permute')):
        im = im.data.cpu()
        if im.shape[0]<5: im=im.permute(1,2,0)
    elif not isinstance(im,np.ndarray): im=array(im)
    # Handle 1-channel images
    if im.shape[-1]==1: im=im[...,0]

    ax = ifnone(ax,ctx)
    if figsize is None: figsize = (_fig_bounds(im.shape[0]), _fig_bounds(im.shape[1]))
    if ax is None: _,ax = plt.subplots(figsize=figsize)
    ax.imshow(im, **kwargs)
    if title is not None: ax.set_title(title)
    ax.axis('off')
    return ax
im3_t = tensor(im3)
im3_t[0:3,0:3] * top_edge
tensor([[-0., -0., -0.],
        [0., 0., 0.],
        [0., 0., 0.]])
(im3_t[0:3,0:3] * top_edge).sum()
tensor(0.)
df = pd.DataFrame(im3_t[:10,:20])
df.style.set_properties(**{'font-size':'6pt'}).background_gradient('Greys')
  0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 0 0 0 12 99 91 142 155 246 182 155 155 155 155 131 52 0 0 0 0
6 0 0 0 138 254 254 254 254 254 254 254 254 254 254 254 252 210 122 33 0
7 0 0 0 220 254 254 254 235 189 189 189 189 150 189 205 254 254 254 75 0
8 0 0 0 35 74 35 35 25 0 0 0 0 0 0 13 224 254 254 153 0
9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 90 254 254 247 53 0
df = pd.DataFrame(im3_t[4:7,6:9])
df.style.set_properties(**{'font-size':'6pt'}).background_gradient('Greys')
  0 1 2
0 0 0 0
1 142 155 246
2 254 254 254
(im3_t[4:7,6:9] * top_edge).sum()
tensor(762.)

Note: Returns a high number because the $3x3$ pixel square represents a top edge.

df = pd.DataFrame(im3_t[7:10,17:20])
df.style.set_properties(**{'font-size':'6pt'}).background_gradient('Greys')
  0 1 2
0 254 75 0
1 254 153 0
2 247 53 0
(im3_t[7:10,17:20] * top_edge).sum()
tensor(-29.)

Note: Returns a low number because the $3x3$ pixel square does not represent a top edge.

# Center coords of the 3x3 matrix will be (row,col)
def apply_kernel(row, col, kernel):
    "Multiply the window of `im3_t` centered on (`row`,`col`) by `kernel` element-wise and sum the result."
    # Rows row-1..row+1 and columns col-1..col+1 make up the window
    rows = slice(row-1, row+2)
    cols = slice(col-1, col+2)
    patch = im3_t[rows, cols]
    return (patch * kernel).sum()
apply_kernel(5,7,top_edge)
tensor(762.)

Mapping a Convolution Kernel

# Nested list comprehension to generate a list of coordinates
[[(i,j) for j in range(1,5)] for i in range(1,5)]
[[(1, 1), (1, 2), (1, 3), (1, 4)],
 [(2, 1), (2, 2), (2, 3), (2, 4)],
 [(3, 1), (3, 2), (3, 3), (3, 4)],
 [(4, 1), (4, 2), (4, 3), (4, 4)]]
rng = range(1,27)
# Map top edge kernel to the generated list of coordinates
top_edge3 = tensor([[apply_kernel(i,j,top_edge) for j in rng] for i in rng])

show_image(top_edge3);

png

Note: Top edges are black and bottom edges are white.

left_edge = tensor([[-1,1,0],
                    [-1,1,0],
                    [-1,1,0]]).float()

left_edge3 = tensor([[apply_kernel(i,j,left_edge) for j in rng] for i in rng])

show_image(left_edge3);

png

right_edge = tensor([[0,1,-1],
                     [0,1,-1],
                     [0,1,-1]]).float()

right_edge3 = tensor([[apply_kernel(i,j,right_edge) for j in rng] for i in rng])

show_image(right_edge3);

png

bottom_edge = tensor([[0,0,0],
                      [1,1,1],
                      [-1,-1,-1]]).float()

bottom_edge3 = tensor([[apply_kernel(i,j,bottom_edge) for j in rng] for i in rng])

show_image(bottom_edge3);

png

Convolutions in PyTorch

diag1_edge = tensor([[ 0,-1, 1],
                     [-1, 1, 0],
                     [ 1, 0, 0]]).float()
diag2_edge = tensor([[ 1,-1, 0],
                     [ 0, 1,-1],
                     [ 0, 0, 1]]).float()

edge_kernels = torch.stack([left_edge, right_edge, top_edge, bottom_edge, diag1_edge, diag2_edge])
edge_kernels.shape
torch.Size([6, 3, 3])
print_source(first)
def first(x, f=None, negate=False, **kwargs):
    "First element of `x`, optionally filtered by `f`, or None if missing"
    x = iter(x)
    if f: x = filter_ex(x, f=f, negate=negate, gen=True, **kwargs)
    return next(x, None)
mnist = DataBlock((ImageBlock(cls=PILImageBW), CategoryBlock), 
                  get_items=get_image_files, 
                  splitter=GrandparentSplitter(),
                  get_y=parent_label)

dls = mnist.dataloaders(path)
xb,yb = first(dls.valid)
xb.shape
torch.Size([64, 1, 28, 28])
# Move to CPU
xb,yb = to_cpu(xb),to_cpu(yb)
edge_kernels.shape,edge_kernels.unsqueeze(1).shape
(torch.Size([6, 3, 3]), torch.Size([6, 1, 3, 3]))
edge_kernels = edge_kernels.unsqueeze(1)
edge_kernels
tensor([[[[[-1.,  1.,  0.],
           [-1.,  1.,  0.],
           [-1.,  1.,  0.]]]],

    
    
            [[[[ 0.,  1., -1.],
               [ 0.,  1., -1.],
               [ 0.,  1., -1.]]]],


    
    
            [[[[-1., -1., -1.],
               [ 0.,  0.,  0.],
               [ 1.,  1.,  1.]]]],


    
    
            [[[[ 0.,  0.,  0.],
               [ 1.,  1.,  1.],
               [-1., -1., -1.]]]],


    
    
            [[[[ 0., -1.,  1.],
               [-1.,  1.,  0.],
               [ 1.,  0.,  0.]]]],


    
    
            [[[[ 1., -1.,  0.],
               [ 0.,  1., -1.],
               [ 0.,  0.,  1.]]]]])
batch_features = F.conv2d(xb, edge_kernels)
batch_features.shape
torch.Size([64, 6, 26, 26])
help(F.conv2d)
Help on built-in function conv2d:

conv2d(...)
    conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor
    
    Applies a 2D convolution over an input image composed of several input
    planes.
    
    This operator supports :ref:`TensorFloat32<tf32_on_ampere>`.
    
    See :class:`~torch.nn.Conv2d` for details and output shape.
    
    Note:
        In some circumstances when given tensors on a CUDA device and using CuDNN, this operator may select a nondeterministic algorithm to increase performance. If this is undesirable, you can try to make the operation deterministic (potentially at a performance cost) by setting ``torch.backends.cudnn.deterministic = True``. See :doc:`/notes/randomness` for more information.



        
        Args:
            input: input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
            weight: filters of shape :math:`(\text{out\_channels} , \frac{\text{in\_channels}}{\text{groups}} , kH , kW)`
            bias: optional bias tensor of shape :math:`(\text{out\_channels})`. Default: ``None``
            stride: the stride of the convolving kernel. Can be a single number or a
              tuple `(sH, sW)`. Default: 1
            padding: implicit paddings on both sides of the input. Can be a string {'valid', 'same'},
              single number or a tuple `(padH, padW)`. Default: 0
              ``padding='valid'`` is the same as no padding. ``padding='same'`` pads
              the input so the output has the shape as the input. However, this mode
              doesn't support any stride values other than 1.
        

          .. warning::
              For ``padding='same'``, if the ``weight`` is even-length and
              ``dilation`` is odd in any dimension, a full :func:`pad` operation
              may be needed internally. Lowering performance.
    
        dilation: the spacing between kernel elements. Can be a single number or
          a tuple `(dH, dW)`. Default: 1
        groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the
          number of groups. Default: 1
    
    Examples::
    
        >>> # With square kernels and equal stride
        >>> filters = torch.randn(8, 4, 3, 3)
        >>> inputs = torch.randn(1, 4, 5, 5)
        >>> F.conv2d(inputs, filters, padding=1)
for i in range(6):
    show_image(batch_features[0,i]);

png

png

png

png

png

png

Strides and Padding

  • appropriate padding ensures the output activation map is the same size as the original image
  • the necessary padding for a $ks \times ks$ kernel (where $ks$ is an odd number) is `ks//2`
    • almost never use even size kernels

Stride

  • the amount of pixels the kernel moves across the image at each step
  • stride-1 convolutions (with appropriate padding) maintain the same image size
  • stride-2 convolutions are useful for reducing the size of the output

Understanding the Convolution Equations

  • CNNs from different viewpoints
    • shows different visualizations for convolutions
  • A convolution can be represented as a special kind of matrix multiplication with two constraints
    1. some elements are always zero
    2. some elements are forced to have the same value
  • These constraints enforce a certain pattern of connectivity

Our First Convolutional Neural Network

  • the kernels for the convolutions are learned during training
    • the model will learn what features are useful for classification

Creating the CNN

simple_net = nn.Sequential(
    nn.Linear(28*28,30),
    nn.ReLU(),
    nn.Linear(30,1)
)
simple_net
Sequential(
  (0): Linear(in_features=784, out_features=30, bias=True)
  (1): ReLU()
  (2): Linear(in_features=30, out_features=1, bias=True)
)
broken_cnn = sequential(
    nn.Conv2d(1,30, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(30,1, kernel_size=3, padding=1)
)
broken_cnn
Sequential(
  (0): Conv2d(1, 30, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU()
  (2): Conv2d(30, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
broken_cnn(xb).shape
torch.Size([64, 1, 28, 28])

Note: We don’t need to specify the input dimensions for convolutional layers because they are automatically applied over each pixel

Note: We can use stride-2 convolutions to progressively decrease the size down to a single output for classification.

  • It is common to increase the number of features at the same time, to maintain the same amount of computation
def conv(ni, nf, ks=3, act=True):
    "Stride-2 `nn.Conv2d` from `ni` to `nf` channels with `ks//2` padding, followed by a ReLU when `act` is true."
    layer = nn.Conv2d(ni, nf, kernel_size=ks, stride=2, padding=ks//2)
    # Without an activation the bare convolution is returned (not wrapped in a Sequential)
    return nn.Sequential(layer, nn.ReLU()) if act else layer
simple_cnn = sequential(
    conv(1 ,4),            #14x14
    conv(4 ,8),            #7x7
    conv(8 ,16),           #4x4
    conv(16,32),           #2x2
    conv(32,2, act=False), #1x1
    # Flatten output to a single dimension
    Flatten(),
)
simple_cnn
Sequential(
  (0): Sequential(
    (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
  )
  (1): Sequential(
    (0): Conv2d(4, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
  )
  (2): Sequential(
    (0): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
  )
  (3): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
  )
  (4): Conv2d(32, 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (5): Flatten(full=False)
)
simple_cnn(xb).shape
torch.Size([64, 2])
learn = Learner(dls, simple_cnn, loss_func=F.cross_entropy, metrics=accuracy)
learn.summary()
Sequential (Input shape: 64 x 1 x 28 x 28)
============================================================================
Layer (type)         Output Shape         Param #    Trainable 
============================================================================
                     64 x 4 x 14 x 14    
Conv2d                                    40         True      
ReLU                                                           
____________________________________________________________________________
                     64 x 8 x 7 x 7      
Conv2d                                    296        True      
ReLU                                                           
____________________________________________________________________________
                     64 x 16 x 4 x 4     
Conv2d                                    1168       True      
ReLU                                                           
____________________________________________________________________________
                     64 x 32 x 2 x 2     
Conv2d                                    4640       True      
ReLU                                                           
____________________________________________________________________________
                     64 x 2 x 1 x 1      
Conv2d                                    578        True      
____________________________________________________________________________
                     64 x 2              
Flatten                                                        
____________________________________________________________________________

Total params: 6,722
Total trainable params: 6,722
Total non-trainable params: 0

Optimizer used: <function Adam at 0x7f576a7d3430>
Loss function: <function cross_entropy at 0x7f57b69003a0>

Callbacks:
  - TrainEvalCallback
  - Recorder
  - ProgressCallback
learn.fit_one_cycle(2, 0.01)
epoch train_loss valid_loss accuracy time
0 0.063063 0.045171 0.987242 00:02
1 0.023533 0.026628 0.991168 00:01

Understanding Convolution Arithmetic

m = learn.model[0]
m
Sequential(
  (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (1): ReLU()
)

1 input channel, four output channels, and a 3x3 kernel

m[0].weight.shape
torch.Size([4, 1, 3, 3])
4*1*3*3
36
m[0].bias.shape
torch.Size([4])

Receptive Fields

  • the area of an image that is involved in the calculation of a layer

A Note About Twitter

  • Many of the top people in deep learning today are Twitter regulars
  • One of the main ways to stay up to date with interesting papers, software releases, and other deep learning news

Color Images

  • a color image is a rank-3 tensor
  • we don’t use the same convolutional kernel for all three color channels
  • kernel has a size of ch_in x 3 x 3 where ch_in is the number of input channels (e.g. 3 for RGB)
image2tensor
<function fastai.vision.core.image2tensor(img)>
print_source(image2tensor)
def image2tensor(img):
    "Transform image to byte tensor in `c*h*w` dim order."
    res = tensor(img)
    if res.dim()==2: res = res.unsqueeze(-1)
    return res.permute(2,0,1)

(<function fastai.vision.core.image2tensor(img)>, None)
im = image2tensor(Image.open(image_bear()))
im.shape
torch.Size([3, 1000, 846])
show_image(im);

png

_,axs = subplots(1,3)
for bear,ax,color in zip(im,axs,('Reds','Greens','Blues')):
    show_image(255-bear, ax=ax, cmap=color)

png

Improving Training Stability

path = untar_data(URLs.MNIST)
path
Path('/home/innom-dt/.fastai/data/mnist_png')
path.ls()
(#2) [Path('/home/innom-dt/.fastai/data/mnist_png/testing'),Path('/home/innom-dt/.fastai/data/mnist_png/training')]
Path(path/'training').ls()
(#10) [Path('/home/innom-dt/.fastai/data/mnist_png/training/2'),Path('/home/innom-dt/.fastai/data/mnist_png/training/4'),Path('/home/innom-dt/.fastai/data/mnist_png/training/1'),Path('/home/innom-dt/.fastai/data/mnist_png/training/6'),Path('/home/innom-dt/.fastai/data/mnist_png/training/5'),Path('/home/innom-dt/.fastai/data/mnist_png/training/9'),Path('/home/innom-dt/.fastai/data/mnist_png/training/3'),Path('/home/innom-dt/.fastai/data/mnist_png/training/0'),Path('/home/innom-dt/.fastai/data/mnist_png/training/8'),Path('/home/innom-dt/.fastai/data/mnist_png/training/7')]
def get_dls(bs=64):
    "Build the `DataLoaders` for the image folders under `path` with batch size `bs`."
    # Black-and-white images as inputs, categories as targets
    blocks = (ImageBlock(cls=PILImageBW), CategoryBlock)
    mnist_block = DataBlock(
        blocks=blocks,
        get_items=get_image_files,
        # Split on the 'training' / 'testing' folder names
        splitter=GrandparentSplitter('training','testing'),
        get_y=parent_label,
        batch_tfms=Normalize()
    )
    return mnist_block.dataloaders(path, bs=bs)

dls = get_dls()
dls.show_batch(max_n=9, figsize=(4,4))

png

A Simple Baseline

  • more convolutional filters are likely required since there are more numbers to recognize
  • it is important to keep the number of filters smaller than the number of pixels in the kernel size
    • this forces the neural network to extract useful features
def conv(ni, nf, ks=3, act=True):
    "Stride-2 `nn.Conv2d` from `ni` to `nf` channels with `ks//2` padding, followed by a ReLU when `act` is true."
    layer = nn.Conv2d(ni, nf, kernel_size=ks, stride=2, padding=ks//2)
    # Without an activation the bare convolution is returned (not wrapped in a Sequential)
    return nn.Sequential(layer, nn.ReLU()) if act else layer
def simple_cnn():
    "Return a new CNN: five `conv` layers (the last without activation, 10 output channels) followed by `Flatten`."
    # Trailing comments give the activation-map size after each layer,
    # assuming 28x28 inputs.
    layers = [
        # Increase the starting kernel size and number of filters
        conv(1 ,8, ks=5),        #14x14
        conv(8 ,16),             #7x7
        conv(16,32),             #4x4
        conv(32,64),             #2x2
        conv(64,10, act=False),  #1x1
        Flatten(),
    ]
    return sequential(*layers)
from fastai.callback.hook import *
def fit(epochs=1, lr=0.06):
    "Train a fresh `simple_cnn` on `dls` for `epochs` epochs with `Learner.fit` at learning rate `lr`; return the `Learner`."
    # `lr` defaults to the value that used to be hard-coded, so `fit()` and
    # `fit(epochs)` behave exactly as before.
    # `ActivationStats(with_hist=True)` records per-layer activation statistics
    # (read back later through `learn.activation_stats`).
    learn = Learner(dls, simple_cnn(), loss_func=F.cross_entropy,
                    metrics=accuracy, cbs=ActivationStats(with_hist=True))
    learn.fit(epochs, lr)
    return learn

fastai ActivationStats

  • provides some handy utilities for plotting the activations during training
ActivationStats
fastai.callback.hook.ActivationStats
print_source(ActivationStats)
@delegates()
class ActivationStats(HookCallback):
    "Callback that record the mean and std of activations."
    order=-20
    def __init__(self, with_hist=False, **kwargs):
        super().__init__(**kwargs)
        self.with_hist = with_hist

    def before_fit(self):
        "Initialize stats."
        super().before_fit()
        self.stats = L()

    def hook(self, m, i, o):
        if isinstance(o, tuple): return self.hook_multi_ouput(o)
        o = o.float()
        res = {'mean': o.mean().item(), 'std': o.std().item(),
               'near_zero': (o<=0.05).long().sum().item()/o.numel()}
        if self.with_hist: res['hist'] = o.histc(40,0,10)
        return res

    def hook_multi_ouput(self,o_tuple):
        "For outputs of RNN which are [nested] tuples of tensors"
        res = []
        for o in self._flatten_tuple(o_tuple):
            if not(isinstance(o, Tensor)): continue
            res.append(self.hook(None, None, o))
        return res

    def _flatten_tuple(self, o_tuple):
        "Recursively flatten a [nested] tuple"
        res = []
        for it in o_tuple:
            if isinstance(it, tuple): res += self._flatten_tuple(it)
            else: res += [it]
        return tuple(res)

    def after_batch(self):
        "Take the stored results and puts it in `self.stats`"
        if self.training and (self.every is None or self.train_iter%self.every == 0):
            self.stats.append(self.hooks.stored)
        super().after_batch()

    def layer_stats(self, idx):
        lstats = self.stats.itemgot(idx)
        return L(lstats.itemgot(o) for o in ('mean','std','near_zero'))

    def hist(self, idx):
        res = self.stats.itemgot(idx).itemgot('hist')
        return torch.stack(tuple(res)).t().float().log1p()

    def color_dim(self, idx, figsize=(10,5), ax=None):
        "The 'colorful dimension' plot"
        res = self.hist(idx)
        if ax is None: ax = subplots(figsize=figsize)[1][0]
        ax.imshow(res, origin='lower')
        ax.axis('off')

    def plot_layer_stats(self, idx):
        _,axs = subplots(1, 3, figsize=(12,3))
        for o,ax,title in zip(self.layer_stats(idx),axs,('mean','std','% near zero')):
            ax.plot(o)
            ax.set_title(title)
learn = fit()
epoch train_loss valid_loss accuracy time
0 0.617736 0.533550 0.831100 00:08
learn.activation_stats.plot_layer_stats(0)

png

Note: Generally, the model should have a consistent (or at least smooth) mean and standard deviation of layer activations during training.

  • Activations near zero indicate we have computation in the model that is doing nothing at all
    • zeros in one layer generally carry over to the next layer, which will then create more zeros
# The penultimate layer
learn.activation_stats.plot_layer_stats(-2)

png

Note: The problems got worse toward the end of the network.

Increase Batch Size

  • a larger batch size can make training more stable
  • larger batches have more accurate gradients, since they are calculated from more data
  • larger batch sizes mean fewer batches per epoch, meaning fewer opportunities for your model to update weights
dls = get_dls(512)
learn = fit()
epoch train_loss valid_loss accuracy time
0 0.444612 0.259085 0.916200 00:05
learn.activation_stats.plot_layer_stats(-2)

png

Note: Still a high number of activations near zero.

1cycle Training

  • it is dangerous to begin training with a high learning rate as the initial random weights are not well suited to the target task
  • don’t want to end with a high learning rate either
  • want to start with a smaller learning rate, then gradually increase it, then gradually decrease it again towards the end of training
  • Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates
    • designed a schedule for learning rate separated into two phases
      1. warmup: the learning rate grows from the minimum value to the maximum value
      2. annealing: the learning rate decreases back to the minimum value
  • 1cycle training allows us to use higher learning rates
    • allows us to train faster
    • results in less overfitting
      • we skip over sharp local minima in the loss landscape
      • we end up in a smoother, more generalizable part of the loss landscape
  • a model that generalizes well is one whose loss would not change much if you changed the input a little

Momentum

def fit(epochs=1, lr=0.06):
    "Train a fresh `simple_cnn` on `dls` with `fit_one_cycle(epochs, lr)` and return the `Learner`."
    # Build the model before the callback, matching the original argument evaluation order
    model = simple_cnn()
    stats_cb = ActivationStats(with_hist=True)
    learn = Learner(dls, model, loss_func=F.cross_entropy, metrics=accuracy, cbs=stats_cb)
    learn.fit_one_cycle(epochs, lr)
    return learn

fastai fit_one_cycle

  • uses cosine annealing instead of linear annealing
  • lr_max: the highest learning rate that will be used during training
    • single number for all layers
    • a list specifying learning rates for each layer group
    • a Python slice object containing learning rates for the first and last layer group
  • div: How much to divide lr_max by to get the starting learning rate
  • div_final: How much to divide lr_max by to get the ending learning rate
  • pct_start: What percentage of the batches to use for warmup
  • moms: a tuple (mom1,mom2,mom3)
    • mom1: the initial momentum
    • mom2: the minimum momentum
    • mom3: the final momentum
Learner.fit_one_cycle
<function fastai.callback.schedule.Learner.fit_one_cycle(self: fastai.learner.Learner, n_epoch, lr_max=None, div=25.0, div_final=100000.0, pct_start=0.25, wd=None, moms=None, cbs=None, reset_opt=False)>
print_source(Learner.fit_one_cycle)
@patch
def fit_one_cycle(self:Learner, n_epoch, lr_max=None, div=25., div_final=1e5, pct_start=0.25, wd=None,
                  moms=None, cbs=None, reset_opt=False):
    "Fit `self.model` for `n_epoch` using the 1cycle policy."
    if self.opt is None: self.create_opt()
    self.opt.set_hyper('lr', self.lr if lr_max is None else lr_max)
    lr_max = np.array([h['lr'] for h in self.opt.hypers])
    scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
              'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
print_source(ParamScheduler)
@docs
class ParamScheduler(Callback):
    "Schedule hyper-parameters according to `scheds`"
    order,run_valid = 60,False

    def __init__(self, scheds): self.scheds = scheds
    def before_fit(self): self.hps = {p:[] for p in self.scheds.keys()}
    def before_batch(self): self._update_val(self.pct_train)

    def _update_val(self, pct):
        for n,f in self.scheds.items(): self.opt.set_hyper(n, f(pct))

    def after_batch(self):
        for p in self.scheds.keys(): self.hps[p].append(self.opt.hypers[-1][p])

    def after_fit(self):
        if hasattr(self.learn, 'recorder') and hasattr(self, 'hps'): self.recorder.hps = self.hps

    _docs = {"before_fit": "Initialize container for hyper-parameters",
             "before_batch": "Set the proper hyper-parameters in the optimizer",
             "after_batch": "Record hyper-parameters of this batch",
             "after_fit": "Save the hyper-parameters in the recorder if there is one"}
learn = fit()
epoch train_loss valid_loss accuracy time
0 0.191914 0.067390 0.979200 00:05
learn.recorder.plot_sched()

png

learn.recorder
Recorder
Recorder
fastai.learner.Recorder

fastai Recorder

  • Documentation
  • records everything that happens during training including
    • losses
    • metrics
    • hyperparameters
      • learning rate
      • momentum
print_source(Recorder)
class Recorder(Callback):
    "Callback that registers statistics (lr, loss and metrics) during training"
    _stateattrs=('lrs','iters','losses','values')
    remove_on_fetch,order = True,50

    def __init__(self, add_time=True, train_metrics=False, valid_metrics=True, beta=0.98):
        store_attr('add_time,train_metrics,valid_metrics')
        self.loss,self.smooth_loss = AvgLoss(),AvgSmoothLoss(beta=beta)

    def before_fit(self):
        "Prepare state for training"
        self.lrs,self.iters,self.losses,self.values = [],[],[],[]
        names = self.metrics.attrgot('name')
        if self.train_metrics and self.valid_metrics:
            names = L('loss') + names
            names = names.map('train_{}') + names.map('valid_{}')
        elif self.valid_metrics: names = L('train_loss', 'valid_loss') + names
        else: names = L('train_loss') + names
        if self.add_time: names.append('time')
        self.metric_names = 'epoch'+names
        self.smooth_loss.reset()

    def after_batch(self):
        "Update all metrics and records lr and smooth loss in training"
        if len(self.yb) == 0: return
        mets = self._train_mets if self.training else self._valid_mets
        for met in mets: met.accumulate(self.learn)
        if not self.training: return
        self.lrs.append(self.opt.hypers[-1]['lr'])
        self.losses.append(self.smooth_loss.value)
        self.learn.smooth_loss = self.smooth_loss.value

    def before_epoch(self):
        "Set timer if `self.add_time=True`"
        self.cancel_train,self.cancel_valid = False,False
        if self.add_time: self.start_epoch = time.time()
        self.log = L(getattr(self, 'epoch', 0))

    def before_train   (self): self._train_mets[1:].map(Self.reset())
    def before_validate(self): self._valid_mets.map(Self.reset())
    def after_train   (self): self.log += self._train_mets.map(_maybe_item)
    def after_validate(self): self.log += self._valid_mets.map(_maybe_item)
    def after_cancel_train(self):    self.cancel_train = True
    def after_cancel_validate(self): self.cancel_valid = True

    def after_epoch(self):
        "Store and log the loss/metric values"
        self.learn.final_record = self.log[1:].copy()
        self.values.append(self.learn.final_record)
        if self.add_time: self.log.append(format_time(time.time() - self.start_epoch))
        self.logger(self.log)
        self.iters.append(self.smooth_loss.count)

    @property
    def _train_mets(self):
        if getattr(self, 'cancel_train', False): return L()
        return L(self.smooth_loss) + (self.metrics if self.train_metrics else L())

    @property
    def _valid_mets(self):
        if getattr(self, 'cancel_valid', False): return L()
        return (L(self.loss) + self.metrics if self.valid_metrics else L())

    def plot_loss(self, skip_start=5, with_valid=True):
        plt.plot(list(range(skip_start, len(self.losses))), self.losses[skip_start:], label='train')
        if with_valid:
            idx = (np.array(self.iters)<skip_start).sum()
            valid_col = self.metric_names.index('valid_loss') - 1
            plt.plot(self.iters[idx:], L(self.values[idx:]).itemgot(valid_col), label='valid')
            plt.legend()
Recorder.plot_sched
<function fastai.callback.schedule.Recorder.plot_sched(self: fastai.learner.Recorder, keys=None, figsize=None)>
print_source(Recorder.plot_sched)
@patch
def plot_sched(self:Recorder, keys=None, figsize=None):
    keys = self.hps.keys() if keys is None else L(keys)
    rows,cols = (len(keys)+1)//2, min(2, len(keys))
    figsize = figsize or (6*cols,4*rows)
    _, axs = plt.subplots(rows, cols, figsize=figsize)
    axs = axs.flatten() if len(keys) > 1 else L(axs)
    for p,ax in zip(keys, axs):
        ax.plot(self.hps[p])
        ax.set_ylabel(p)
learn.activation_stats.plot_layer_stats(-2)

png

Note: The percentage of near-zero activations is better, but still high.

learn.activation_stats
ActivationStats
print_source(ActivationStats)
@delegates()
class ActivationStats(HookCallback):
    "Callback that records the mean, std and near-zero fraction of the hooked activations."
    order=-20
    def __init__(self, with_hist=False, **kwargs):
        super().__init__(**kwargs)
        # When set, `hook` also stores a histogram of each output
        self.with_hist = with_hist

    def before_fit(self):
        "Start the fit with an empty `stats` list."
        super().before_fit()
        self.stats = L()

    def hook(self, m, i, o):
        "Summarize output `o` as a dict of statistics (a list of dicts when `o` is a tuple)"
        if isinstance(o, tuple): return self.hook_multi_ouput(o)
        o = o.float()
        summary = {'mean': o.mean().item(), 'std': o.std().item()}
        # Fraction of elements that are <= 0.05 (negative values are counted as well)
        summary['near_zero'] = (o<=0.05).long().sum().item()/o.numel()
        # 40 bins over the range [0, 10]
        if self.with_hist: summary['hist'] = o.histc(40,0,10)
        return summary

    def hook_multi_ouput(self,o_tuple):
        "Statistics for every tensor found in a [nested] tuple of outputs, e.g. from an RNN"
        tensors = (o for o in self._flatten_tuple(o_tuple) if isinstance(o, Tensor))
        return [self.hook(None, None, o) for o in tensors]

    def _flatten_tuple(self, o_tuple):
        "Return the leaves of a [nested] tuple as one flat tuple"
        flat = []
        for item in o_tuple:
            if isinstance(item, tuple): flat.extend(self._flatten_tuple(item))
            else: flat.append(item)
        return tuple(flat)

    def after_batch(self):
        "Append the hooks' stored results to `self.stats` for recorded training batches"
        # Only training batches are kept, and only those matching the `every` period if one is set
        keep = self.training and (self.every is None or self.train_iter%self.every == 0)
        if keep: self.stats.append(self.hooks.stored)
        super().after_batch()

    def layer_stats(self, idx):
        "Per-batch series of 'mean', 'std' and 'near_zero' for the hook at position `idx`"
        per_batch = self.stats.itemgot(idx)
        return L(per_batch.itemgot(key) for key in ('mean','std','near_zero'))

    def hist(self, idx):
        "Stack the stored histograms of hook `idx`, transpose them and apply `log1p`"
        hists = self.stats.itemgot(idx).itemgot('hist')
        stacked = torch.stack(tuple(hists))
        return stacked.t().float().log1p()

    def color_dim(self, idx, figsize=(10,5), ax=None):
        "Draw the 'colorful dimension' image of the histograms recorded for hook `idx`"
        img = self.hist(idx)
        if ax is None: ax = subplots(figsize=figsize)[1][0]
        ax.imshow(img, origin='lower')
        ax.axis('off')

    def plot_layer_stats(self, idx):
        "Plot the mean, std and near-zero series of hook `idx` on three side-by-side axes"
        titles = ('mean','std','% near zero')
        _,axs = subplots(1, 3, figsize=(12,3))
        for series,ax,title in zip(self.layer_stats(idx), axs, titles):
            ax.plot(series)
            ax.set_title(title)

fastai color_dim

  • Detailed Explanation
  • developed with fast.ai student Stefano Giomo
  • express with colors the mean and standard deviation of activations for each batch during training
  • vertical axis represents a group (bin) of activation values
  • each column in the horizontal axis is a batch
  • the colors represent how many activations for that batch have a value in that bin
# Set matplotlib color map
matplotlib.rcParams['image.cmap'] = 'viridis'
learn.activation_stats.color_dim(-2)

png

Note: This shows the classic picture of “bad training”:

  • Starts with nearly all activations at zero
  • The number of nonzero activations increases exponentially over the first few batches
  • Then it goes too far and collapses with most activations returning to zero or near-zero
  • This cycle repeats a few times before we see a spread of activations throughout the range
  • This can be addressed with batch normalization

Batch Normalization

  • take the average of the mean and standard deviations of the activations of a layer and use those to normalize the activations
    • this by itself can cause problems if the network wants some activations to be really high in order to make accurate predictions
      • resolved by adding two learnable parameters, gamma and beta
        • gamma*y + beta where y is a vector of normalized activations
  • Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift
    • training deep neural networks is complicated by the fact that the distribution of each layer’s inputs changes during training, as parameters of the previous layers change
      • called internal covariate shift
      • slows down training by requiring lower learning rates and careful parameter initialization
      • resolved by normalizing layer inputs (each mini-batch)
  • batch normalization allows much higher learning rates and is less sensitive to parameter initialization

  • Different behavior during training and validation
    • training: use the mean and standard deviation of the batch to normalize the data
    • validation: use a running mean of the statistics calculated during training
  • models with batch normalization layers tend to generalize better

$y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta$

nn.BatchNorm2d
torch.nn.modules.batchnorm.BatchNorm2d
print_source(nn.BatchNorm2d)
class BatchNorm2d(_BatchNorm):
    r"""Batch Normalization for 4D inputs (a mini-batch of 2D inputs with an
    extra channel dimension), following the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Mean and standard-deviation are computed per-dimension over the
    mini-batches. :math:`\gamma` and :math:`\beta` are learnable parameter
    vectors of size `C` (where `C` is the input size); by default the elements
    of :math:`\gamma` start at 1 and the elements of :math:`\beta` start at 0.
    The standard-deviation uses the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    By default the layer also keeps running estimates of the computed mean and
    variance during training, and uses them for normalization during
    evaluation. The running estimates are kept with a default
    :attr:`momentum` of 0.1.

    When :attr:`track_running_stats` is ``False`` no running estimates are
    kept, and batch statistics are used during evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from the one used in
        optimizer classes and from the conventional notion of momentum.
        Mathematically, the update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Since the normalization is done over the `C` dimension, computing
    statistics on `(N, H, W)` slices, it is commonly called Spatial Batch
    Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, gives this module
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True`` makes
            this module track the running mean and variance; when set to
            ``False`` the module does not track such statistics and
            initializes the statistics buffers :attr:`running_mean` and
            :attr:`running_var` as ``None``. When these buffers are ``None``,
            this module always uses batch statistics, in both training and
            eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm2d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        # Reject anything that is not a 4D tensor
        n_dims = input.dim()
        if n_dims == 4:
            return
        raise ValueError("expected 4D input (got {}D input)".format(n_dims))
def conv(ni, nf, ks=3, act=True):
    "Stride-2 `ks`x`ks` convolution from `ni` to `nf` channels, then batchnorm, then ReLU if `act`"
    # padding=ks//2 with stride 2 halves the spatial size for odd kernel sizes
    conv2d = nn.Conv2d(ni, nf, stride=2, kernel_size=ks, padding=ks//2)
    norm = nn.BatchNorm2d(nf)
    layers = (conv2d, norm, nn.ReLU()) if act else (conv2d, norm)
    return nn.Sequential(*layers)
learn = fit()
epoch train_loss valid_loss accuracy time
0 0.135761 0.058451 0.985700 00:06
learn.activation_stats.color_dim(-4)

png

Note: Shows a smooth development of activations, with no crashes.

# Try training for longer and at a higher learning rate
learn = fit(5, lr=0.1)
epoch train_loss valid_loss accuracy time
0 0.185239 0.153986 0.951600 00:06
1 0.083529 0.110004 0.965800 00:06
2 0.052301 0.048957 0.984400 00:07
3 0.034640 0.032938 0.988600 00:06
4 0.017389 0.024644 0.991700 00:06
learn = fit(5, lr=0.1)
epoch train_loss valid_loss accuracy time
0 0.187077 0.099310 0.969900 00:06
1 0.077691 0.089945 0.972400 00:06
2 0.050960 0.061807 0.980500 00:06
3 0.033020 0.030316 0.989600 00:06
4 0.017050 0.023186 0.992000 00:06

References