1. VGG block

  • 3x3 Conv, pad 1 (n conv layers per block, each followed by ReLU; the channel count m usually doubles from block to block)
  • 2x2 MaxPool, stride 2 (half size per block)

It turns out that stacking several 3x3 Convs works better than a single larger (e.g. 5x5) Conv: the stack covers the same receptive field with fewer parameters and more non-linearities.

2. Architecture

  • multiple VGG blocks
  • Dense (4096) (Flatten, Linear, ReLU, Dropout)
  • Dense (4096) (Linear, ReLU, Dropout)
  • Dense (1000) (Linear; 1000 classes for ImageNet — the code below uses 10)

3. Code

import torch 
from torch import nn

def vgg_block(num_conv: int, in_channels: int, out_channels: int) -> nn.Sequential:
    """Build one VGG block: `num_conv` 3x3 conv+ReLU layers, then a 2x2 max-pool.

    Args:
        num_conv: number of 3x3 convolution layers in the block.
        in_channels: channel count of the block's input feature map.
        out_channels: channel count produced by every conv layer in the block.

    Returns:
        nn.Sequential mapping ``in_channels -> out_channels`` and halving the
        spatial size (via the stride-2 pool).
    """
    # NOTE: the original annotated this as `List[nn.Module]` without importing
    # typing.List; the builtin generic `list[...]` is the correct spelling.
    layers: list[nn.Module] = []
    for _ in range(num_conv):
        # padding=1 keeps the spatial size unchanged for a 3x3 kernel.
        layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        layers.append(nn.ReLU())
        in_channels = out_channels  # subsequent convs map out_channels -> out_channels
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)

class vgg_net(nn.Module):
    """VGG-style network: stacked VGG blocks followed by a 3-layer dense head.

    Assumes the input is 224x224 with 5 blocks (224 / 2**5 == 7), because the
    first Linear layer is sized for a 7x7 final feature map.
    """

    def __init__(self, cfg: tuple, in_channels: int, num_classes: int = 10) -> None:
        """
        Args:
            cfg: sequence of ``(num_conv, out_channels)`` pairs, one per block.
            in_channels: channel count of the network input (e.g. 3 for RGB).
            num_classes: output size of the final Linear layer.  Defaults to
                10, matching the original hard-coded value.
        """
        super().__init__()
        layers: list[nn.Module] = []
        for num_conv, out_channels in cfg:
            layers.append(vgg_block(num_conv, in_channels, out_channels))
            in_channels = out_channels  # next block consumes this block's output
        self.model = nn.Sequential(
            *layers,
            nn.Flatten(),
            # 7 * 7 is the spatial size left after the pooling layers; see the
            # class docstring for the 224-input assumption.
            nn.Linear(in_channels * 7 * 7, 4096), nn.ReLU(), nn.Dropout(p=0.5),
            nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes),
        )

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        # Layer-by-layer pass with debug prints of every intermediate shape
        # (kept from the original; handy for verifying the architecture).
        for layer in self.model:
            X = layer(X)
            print(layer.__class__.__name__, "output shape:\t", X.shape)
        print(X)
        # NOTE(review): returns class probabilities.  If training with
        # nn.CrossEntropyLoss, the raw logits (pre-softmax) should be used.
        return nn.Softmax(1)(X)     # Classification

# Smoke test: push one random 224x224 RGB image through a 5-block VGG config
# and print every intermediate shape.
sample = torch.rand(size=(1, 3, 224, 224), dtype=torch.float32)
block_cfg = ((3, 64), (1, 128), (2, 256), (2, 512), (2, 512))
network = vgg_net(block_cfg, in_channels=3)
network(sample)