ConvNeXt V2 on CIFAR-10, implemented according to arXiv:2301.00808 [cs.CV]
Original implementation in https://github.com/facebookresearch/ConvNeXt-V2
Imports
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
import ignite.metrics
import ignite.contrib.handlers
Configuration
# --- Data ---
DATA_DIR = './data'   # root directory handed to the dataset constructors
IMAGE_SIZE = 32       # side length used by the random-crop augmentation
NUM_CLASSES = 10      # size of the classifier output

# --- Loading ---
NUM_WORKERS = 8
BATCH_SIZE = 32

# --- Optimisation ---
EPOCHS = 100
LEARNING_RATE = 0.001   # also used as the peak rate of the one-cycle schedule
WEIGHT_DECAY = 0.1      # applied only to the "decay" parameter group

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", DEVICE)
device: cuda
# Augmentations applied to every training sample, in this order; the final
# ToTensor converts the image to a tensor.
_train_augmentations = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(IMAGE_SIZE, padding=4),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
]
train_transform = transforms.Compose(_train_augmentations)

# CIFAR-10 splits; the test split is only converted to tensors (no augmentation).
train_dset = datasets.CIFAR10(
    root=DATA_DIR,
    train=True,
    download=True,
    transform=train_transform,
)
test_dset = datasets.CIFAR10(
    root=DATA_DIR,
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)
Files already downloaded and verified Files already downloaded and verified
def dataset_show_image(dset, idx):
    """Show sample ``idx`` of ``dset`` in a new figure, titled with its class name.

    ``dset[idx]`` must yield an ``(image, label)`` pair where ``image`` has a
    ``.numpy()`` method, and ``dset.classes`` must map the label to a name.
    """
    image, label = dset[idx]
    caption = "Ground truth: {}".format(dset.classes[label])

    figure, axes = plt.subplots()
    axes.set_axis_off()
    # Move axis 0 to the end before plotting
    # (presumably (C, H, W) -> (H, W, C) for ToTensor output -- confirm).
    axes.imshow(np.moveaxis(image.numpy(), 0, -1))
    axes.set_title(caption)
    plt.show()
# Preview one sample of the test split.
dataset_show_image(test_dset, 1)

# Options shared by both loaders; only shuffling differs between the splits.
_loader_options = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
train_loader = torch.utils.data.DataLoader(train_dset, shuffle=True, **_loader_options)
test_loader = torch.utils.data.DataLoader(test_dset, shuffle=False, **_loader_options)
Utilities
class LayerNormChannels(nn.Module):
    """Layer normalisation over dimension 1 of the input.

    ``nn.LayerNorm(channels)`` normalises the trailing dimension, so the input
    has dimension 1 swapped to the end, is normalised, and is swapped back.
    The output has the same shape as the input.
    """

    def __init__(self, channels):
        super().__init__()
        self.norm = nn.LayerNorm(channels)

    def forward(self, x):
        channels_last = x.transpose(1, -1)
        normalised = self.norm(channels_last)
        return normalised.transpose(-1, 1)
class Residual(nn.Module):
    """Skip connection around a sequence of layers with a learnable scale.

    Computes ``x + gamma * residual(x)``, where ``residual`` is the given
    layers chained in an ``nn.Sequential`` and ``gamma`` is a single learnable
    scalar created as zero (so the module starts out as the identity).
    """

    def __init__(self, *layers):
        super().__init__()
        self.residual = nn.Sequential(*layers)
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        branch = self.residual(x)
        return x + self.gamma * branch
GRN (Global Response Normalization) layer
class GRN(nn.Module):
    """Global Response Normalization for inputs indexed as (dim0, channels, dim2, dim3).

    For each sample and channel the L2 norm over dimensions 2 and 3 is taken,
    then divided by the mean of those norms across dimension 1 (plus 1e-6).
    The result rescales the input: ``gamma * (x * Nx) + beta + x``.
    ``gamma`` and ``beta`` are per-channel parameters created as zeros, so the
    layer is the identity until they are trained.
    """

    def __init__(self, channels):
        super().__init__()
        param_shape = (1, channels, 1, 1)
        self.gamma = nn.Parameter(torch.zeros(*param_shape))
        self.beta = nn.Parameter(torch.zeros(*param_shape))

    def forward(self, x):
        # Per-channel aggregate: L2 norm over dimensions 2 and 3.
        global_norm = x.norm(p=2, dim=(2, 3), keepdim=True)
        # Relative importance of each channel w.r.t. the mean over channels.
        channel_mean = global_norm.mean(dim=1, keepdim=True)
        relative_norm = global_norm / (channel_mean + 1e-6)
        return self.gamma * (x * relative_norm) + self.beta + x
ConvNeXtV2 stages
class ConvNeXtV2Block(Residual):
    """Residual block: depthwise conv -> LayerNorm -> 1x1 expand -> GELU -> GRN -> 1x1 project -> dropout.

    Args:
        channels: number of input and output channels.
        kernel_size: kernel size of the depthwise convolution.
        mult: expansion factor for the hidden width of the 1x1 convolutions.
        p_drop: dropout probability applied at the end of the residual branch.
    """

    def __init__(self, channels, kernel_size, mult=4, p_drop=0.):
        expanded = channels * mult
        pad = (kernel_size - 1) // 2
        branch = [
            # groups=channels makes this a depthwise convolution
            nn.Conv2d(channels, channels, kernel_size, padding=pad, groups=channels),
            LayerNormChannels(channels),
            nn.Conv2d(channels, expanded, 1),
            nn.GELU(),
            GRN(expanded),
            nn.Conv2d(expanded, channels, 1),
            nn.Dropout(p_drop),
        ]
        super().__init__(*branch)
class DownsampleBlock(nn.Sequential):
    """Channel-wise LayerNorm followed by a strided convolution.

    The convolution uses ``stride`` as both its kernel size and its stride,
    and changes the channel count from ``in_channels`` to ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, stride=2):
        norm = LayerNormChannels(in_channels)
        reduce = nn.Conv2d(in_channels, out_channels, kernel_size=stride, stride=stride)
        super().__init__(norm, reduce)
class Stage(nn.Sequential):
    """One stage: an optional downsampling block followed by ``num_blocks`` ConvNeXtV2 blocks.

    The downsampling block is only inserted when the channel count changes
    (``in_channels != out_channels``).
    """

    def __init__(self, in_channels, out_channels, num_blocks, kernel_size, p_drop=0.):
        modules = []
        if in_channels != out_channels:
            modules.append(DownsampleBlock(in_channels, out_channels))
        modules.extend(
            ConvNeXtV2Block(out_channels, kernel_size, p_drop=p_drop)
            for _ in range(num_blocks)
        )
        super().__init__(*modules)
class ConvNeXtV2Body(nn.Sequential):
    """Chain of stages, one per ``(channels, num_blocks)`` pair.

    Each stage takes the previous stage's output width as its input width;
    the first stage takes ``in_channels``.
    """

    def __init__(self, in_channels, channel_list, num_blocks_list, kernel_size, p_drop=0.):
        stages = []
        prev_width = in_channels
        for width, depth in zip(channel_list, num_blocks_list):
            stages.append(Stage(prev_width, width, depth, kernel_size, p_drop))
            prev_width = width
        super().__init__(*stages)
Main model
class Stem(nn.Sequential):
    """Input embedding: a convolution with kernel size and stride both equal to ``patch_size``, then channel-wise LayerNorm."""

    def __init__(self, in_channels, out_channels, patch_size):
        patchify = nn.Conv2d(in_channels, out_channels, kernel_size=patch_size, stride=patch_size)
        norm = LayerNormChannels(out_channels)
        super().__init__(patchify, norm)
class Head(nn.Sequential):
    """Classification head: global average pool -> flatten -> LayerNorm -> linear layer.

    Args:
        in_channels: number of channels of the incoming feature map.
        classes: number of output logits.
    """

    def __init__(self, in_channels, classes):
        pipeline = [
            nn.AdaptiveAvgPool2d(1),   # pool each channel to a single value
            nn.Flatten(),
            nn.LayerNorm(in_channels),
            nn.Linear(in_channels, classes),
        ]
        super().__init__(*pipeline)
class ConvNeXtV2(nn.Sequential):
    """ConvNeXt V2 classifier assembled as ``Stem -> ConvNeXtV2Body -> Head``.

    Args:
        classes: number of output logits.
        channel_list: channel width of each stage; the first entry is also the
            stem's output width and the last entry feeds the head.
        num_blocks_list: number of blocks in each stage (paired with
            ``channel_list``).
        kernel_size: kernel size of the depthwise convolutions.
        patch_size: kernel size and stride of the stem convolution.
        in_channels: number of channels of the input images.
        res_p_drop: dropout probability inside every residual block.
    """

    def __init__(self, classes, channel_list, num_blocks_list, kernel_size, patch_size,
                 in_channels=3, res_p_drop=0.):
        super().__init__(
            Stem(in_channels, channel_list[0], patch_size),
            ConvNeXtV2Body(channel_list[0], channel_list, num_blocks_list, kernel_size, res_p_drop),
            Head(channel_list[-1], classes)
        )
        self.reset_parameters()

    def reset_parameters(self):
        """(Re-)initialise every parameter of the model in place.

        * ``nn.Linear`` / ``nn.Conv2d``: weights ~ N(0, 0.02^2), biases zero.
        * ``nn.LayerNorm``: weight one, bias zero.
        * ``Residual`` (including its subclasses): ``gamma`` zero.
        * ``GRN``: ``gamma`` and ``beta`` zero.
        """
        for m in self.modules():
            if isinstance(m, (nn.Linear, nn.Conv2d)):
                nn.init.normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.weight, 1.)
                nn.init.zeros_(m.bias)
            elif isinstance(m, Residual):
                nn.init.zeros_(m.gamma)
            elif isinstance(m, GRN):
                nn.init.zeros_(m.gamma)
                nn.init.zeros_(m.beta)

    def separate_parameters(self):
        """Split parameter names into weight-decay and no-weight-decay sets.

        Returns:
            ``(parameters_decay, parameters_no_decay)``: two disjoint sets of
            fully-qualified parameter names (as produced by
            ``named_parameters()``). Weights of ``nn.Linear`` / ``nn.Conv2d``
            go into the first set; all biases, every ``nn.LayerNorm``
            parameter, ``Residual.gamma`` and ``GRN.gamma`` / ``GRN.beta`` go
            into the second.

        Raises:
            AssertionError: if a name lands in both sets or some parameter of
                this model was not classified.
        """
        parameters_decay = set()
        parameters_no_decay = set()
        modules_weight_decay = (nn.Linear, nn.Conv2d)
        modules_no_weight_decay = (nn.LayerNorm,)

        for m_name, m in self.named_modules():
            # recurse=False: every parameter is classified exactly once, by the
            # module that directly owns it, instead of once per ancestor.
            for param_name, _ in m.named_parameters(recurse=False):
                full_param_name = f"{m_name}.{param_name}" if m_name else param_name

                if isinstance(m, modules_no_weight_decay):
                    parameters_no_decay.add(full_param_name)
                elif param_name.endswith("bias"):
                    parameters_no_decay.add(full_param_name)
                elif isinstance(m, Residual) and param_name.endswith("gamma"):
                    parameters_no_decay.add(full_param_name)
                elif isinstance(m, GRN) and param_name.endswith(("gamma", "beta")):
                    parameters_no_decay.add(full_param_name)
                elif isinstance(m, modules_weight_decay):
                    parameters_decay.add(full_param_name)

        # sanity check -- counted against *this* instance, not a global model
        assert len(parameters_decay & parameters_no_decay) == 0
        assert len(parameters_decay) + len(parameters_no_decay) == len(list(self.parameters()))

        return parameters_decay, parameters_no_decay
# Four stages with two blocks each; patch_size=1 means the stem convolution
# uses kernel size 1 and stride 1.
model = ConvNeXtV2(
    NUM_CLASSES,
    channel_list=[64, 128, 256, 512],
    num_blocks_list=[2, 2, 2, 2],
    kernel_size=7,
    patch_size=1,
    res_p_drop=0.,
)
model.to(DEVICE)

num_parameters = sum(p.numel() for p in model.parameters())
print("Number of parameters: {:,}".format(num_parameters))
Number of parameters: 6,391,826
def get_optimizer(model, learning_rate, weight_decay):
    """Build an AdamW optimizer with weight decay applied to a subset of parameters.

    Args:
        model: module exposing ``named_parameters()`` and a
            ``separate_parameters()`` method that returns two collections of
            parameter names ``(decay, no_decay)``.
        learning_rate: learning rate for both parameter groups.
        weight_decay: weight decay for the first group; the second group
            always uses ``0.0``.

    Returns:
        ``optim.AdamW`` with two parameter groups, ``[decay, no_decay]``.
    """
    param_dict = dict(model.named_parameters())
    parameters_decay, parameters_no_decay = model.separate_parameters()

    # The names arrive as sets, whose iteration order can change from one
    # Python process to the next (string hash randomisation). Optimizer state
    # dicts refer to parameters by their position inside each group, so the
    # names are sorted to keep that position stable across runs.
    optim_groups = [
        {"params": [param_dict[pn] for pn in sorted(parameters_decay)], "weight_decay": weight_decay},
        {"params": [param_dict[pn] for pn in sorted(parameters_no_decay)], "weight_decay": 0.0},
    ]
    return optim.AdamW(optim_groups, lr=learning_rate)
# Objective, optimizer and the ignite training engine.
loss = nn.CrossEntropyLoss()
optimizer = get_optimizer(model, learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
trainer = create_supervised_trainer(model, optimizer, loss, device=DEVICE)

# One-cycle schedule sized for EPOCHS * len(train_loader) optimizer steps.
lr_scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=LEARNING_RATE,
    steps_per_epoch=len(train_loader),
    epochs=EPOCHS,
)


def _step_lr_scheduler(engine):
    """Advance the learning-rate schedule by one step (called after every iteration)."""
    lr_scheduler.step()


trainer.add_event_handler(Events.ITERATION_COMPLETED, _step_lr_scheduler)

# Running average of the trainer's per-iteration output, exposed as the "loss"
# metric (the output is presumably the batch loss under ignite's defaults -- confirm).
ignite.metrics.RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
Evaluator
# Metrics computed by the evaluator on each validation run.
val_metrics = {
    "accuracy": ignite.metrics.Accuracy(),
    "loss": ignite.metrics.Loss(loss),
}
evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=DEVICE)

# Per-epoch curves, keyed by series name; filled in after every epoch.
history = defaultdict(list)
@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    """Record the epoch's training loss, evaluate on the test loader, and print a summary.

    Appends to ``history`` under ``'train loss'``, ``'val loss'`` and
    ``'val acc'``, then prints one line with the epoch counter and the three
    values.
    """
    state = engine.state
    current_epoch, total_epochs = state.epoch, state.max_epochs

    train_loss = state.metrics["loss"]
    history['train loss'].append(train_loss)

    # Run one full pass over the test loader and read the resulting metrics.
    evaluator.run(test_loader)
    results = evaluator.state.metrics
    val_loss, val_acc = results["loss"], results["accuracy"]
    history['val loss'].append(val_loss)
    history['val acc'].append(val_acc)

    summary = "{}/{} - train: loss {:.3f}; val: loss {:.3f} accuracy {:.3f}".format(
        current_epoch, total_epochs, train_loss, val_loss, val_acc)
    print(summary)
# Train for EPOCHS epochs over train_loader; handlers registered on the
# trainer (LR scheduler step, per-epoch validation logging) fire during the run.
trainer.run(train_loader, max_epochs=EPOCHS);
1/100 - train: loss 1.851; val: loss 1.831 accuracy 0.331 2/100 - train: loss 1.682; val: loss 1.619 accuracy 0.415 3/100 - train: loss 1.553; val: loss 1.524 accuracy 0.447 4/100 - train: loss 1.479; val: loss 1.443 accuracy 0.469 5/100 - train: loss 1.429; val: loss 1.395 accuracy 0.493 6/100 - train: loss 1.358; val: loss 1.309 accuracy 0.527 7/100 - train: loss 1.278; val: loss 1.260 accuracy 0.548 8/100 - train: loss 1.233; val: loss 1.221 accuracy 0.565 9/100 - train: loss 1.126; val: loss 1.073 accuracy 0.613 10/100 - train: loss 1.020; val: loss 0.987 accuracy 0.648 11/100 - train: loss 0.968; val: loss 0.946 accuracy 0.665 12/100 - train: loss 0.860; val: loss 0.868 accuracy 0.689 13/100 - train: loss 0.822; val: loss 0.816 accuracy 0.713 14/100 - train: loss 0.758; val: loss 0.750 accuracy 0.732 15/100 - train: loss 0.719; val: loss 0.690 accuracy 0.758 16/100 - train: loss 0.659; val: loss 0.644 accuracy 0.775 17/100 - train: loss 0.595; val: loss 0.641 accuracy 0.772 18/100 - train: loss 0.608; val: loss 0.589 accuracy 0.794 19/100 - train: loss 0.557; val: loss 0.573 accuracy 0.804 20/100 - train: loss 0.518; val: loss 0.589 accuracy 0.797 21/100 - train: loss 0.514; val: loss 0.599 accuracy 0.790 22/100 - train: loss 0.502; val: loss 0.548 accuracy 0.810 23/100 - train: loss 0.474; val: loss 0.487 accuracy 0.831 24/100 - train: loss 0.454; val: loss 0.487 accuracy 0.833 25/100 - train: loss 0.431; val: loss 0.523 accuracy 0.829 26/100 - train: loss 0.441; val: loss 0.518 accuracy 0.822 27/100 - train: loss 0.459; val: loss 0.464 accuracy 0.842 28/100 - train: loss 0.418; val: loss 0.441 accuracy 0.850 29/100 - train: loss 0.429; val: loss 0.439 accuracy 0.845 30/100 - train: loss 0.416; val: loss 0.433 accuracy 0.852 31/100 - train: loss 0.383; val: loss 0.437 accuracy 0.853 32/100 - train: loss 0.387; val: loss 0.474 accuracy 0.842 33/100 - train: loss 0.355; val: loss 0.482 accuracy 0.833 34/100 - train: loss 0.356; val: loss 0.453 accuracy 0.848 
35/100 - train: loss 0.349; val: loss 0.480 accuracy 0.841 36/100 - train: loss 0.350; val: loss 0.416 accuracy 0.858 37/100 - train: loss 0.370; val: loss 0.401 accuracy 0.862 38/100 - train: loss 0.331; val: loss 0.471 accuracy 0.844 39/100 - train: loss 0.338; val: loss 0.433 accuracy 0.851 40/100 - train: loss 0.349; val: loss 0.433 accuracy 0.854 41/100 - train: loss 0.338; val: loss 0.387 accuracy 0.869 42/100 - train: loss 0.309; val: loss 0.400 accuracy 0.869 43/100 - train: loss 0.312; val: loss 0.419 accuracy 0.865 44/100 - train: loss 0.310; val: loss 0.432 accuracy 0.856 45/100 - train: loss 0.272; val: loss 0.414 accuracy 0.864 46/100 - train: loss 0.252; val: loss 0.393 accuracy 0.871 47/100 - train: loss 0.273; val: loss 0.430 accuracy 0.860 48/100 - train: loss 0.262; val: loss 0.385 accuracy 0.875 49/100 - train: loss 0.254; val: loss 0.396 accuracy 0.874 50/100 - train: loss 0.247; val: loss 0.394 accuracy 0.869 51/100 - train: loss 0.237; val: loss 0.368 accuracy 0.878 52/100 - train: loss 0.227; val: loss 0.432 accuracy 0.867 53/100 - train: loss 0.269; val: loss 0.376 accuracy 0.874 54/100 - train: loss 0.226; val: loss 0.358 accuracy 0.884 55/100 - train: loss 0.216; val: loss 0.423 accuracy 0.868 56/100 - train: loss 0.221; val: loss 0.372 accuracy 0.881 57/100 - train: loss 0.203; val: loss 0.406 accuracy 0.873 58/100 - train: loss 0.204; val: loss 0.340 accuracy 0.890 59/100 - train: loss 0.180; val: loss 0.337 accuracy 0.893 60/100 - train: loss 0.160; val: loss 0.374 accuracy 0.883 61/100 - train: loss 0.166; val: loss 0.349 accuracy 0.889 62/100 - train: loss 0.149; val: loss 0.356 accuracy 0.890 63/100 - train: loss 0.168; val: loss 0.361 accuracy 0.890 64/100 - train: loss 0.139; val: loss 0.432 accuracy 0.870 65/100 - train: loss 0.128; val: loss 0.354 accuracy 0.893 66/100 - train: loss 0.123; val: loss 0.357 accuracy 0.893 67/100 - train: loss 0.122; val: loss 0.367 accuracy 0.890 68/100 - train: loss 0.098; val: loss 0.382 accuracy 
0.886 69/100 - train: loss 0.108; val: loss 0.459 accuracy 0.872 70/100 - train: loss 0.090; val: loss 0.363 accuracy 0.895 71/100 - train: loss 0.093; val: loss 0.365 accuracy 0.898 72/100 - train: loss 0.089; val: loss 0.374 accuracy 0.899 73/100 - train: loss 0.070; val: loss 0.383 accuracy 0.900 74/100 - train: loss 0.056; val: loss 0.358 accuracy 0.902 75/100 - train: loss 0.082; val: loss 0.396 accuracy 0.892 76/100 - train: loss 0.051; val: loss 0.373 accuracy 0.903 77/100 - train: loss 0.056; val: loss 0.384 accuracy 0.903 78/100 - train: loss 0.047; val: loss 0.376 accuracy 0.900 79/100 - train: loss 0.042; val: loss 0.377 accuracy 0.904 80/100 - train: loss 0.033; val: loss 0.421 accuracy 0.900 81/100 - train: loss 0.020; val: loss 0.385 accuracy 0.911 82/100 - train: loss 0.019; val: loss 0.387 accuracy 0.907 83/100 - train: loss 0.022; val: loss 0.374 accuracy 0.908 84/100 - train: loss 0.019; val: loss 0.393 accuracy 0.909 85/100 - train: loss 0.012; val: loss 0.397 accuracy 0.911 86/100 - train: loss 0.008; val: loss 0.391 accuracy 0.915 87/100 - train: loss 0.009; val: loss 0.388 accuracy 0.914 88/100 - train: loss 0.008; val: loss 0.391 accuracy 0.917 89/100 - train: loss 0.002; val: loss 0.401 accuracy 0.915 90/100 - train: loss 0.004; val: loss 0.403 accuracy 0.918 91/100 - train: loss 0.003; val: loss 0.409 accuracy 0.915 92/100 - train: loss 0.001; val: loss 0.399 accuracy 0.917 93/100 - train: loss 0.002; val: loss 0.396 accuracy 0.920 94/100 - train: loss 0.001; val: loss 0.393 accuracy 0.924 95/100 - train: loss 0.001; val: loss 0.387 accuracy 0.923 96/100 - train: loss 0.000; val: loss 0.390 accuracy 0.924 97/100 - train: loss 0.000; val: loss 0.392 accuracy 0.923 98/100 - train: loss 0.001; val: loss 0.391 accuracy 0.923 99/100 - train: loss 0.000; val: loss 0.391 accuracy 0.923 100/100 - train: loss 0.001; val: loss 0.391 accuracy 0.923
def plot_history_train_val(history, key):
    """Plot the ``'train <key>'`` and ``'val <key>'`` series of ``history`` against the epoch number.

    Epochs are numbered from 1; the x-axis length follows the training series.
    """
    figure, axes = plt.subplots()

    train_name, val_name = 'train ' + key, 'val ' + key
    epochs = np.arange(1, len(history[train_name]) + 1)
    for label, series_name in (('train', train_name), ('val', val_name)):
        axes.plot(epochs, history[series_name], '.-', label=label)

    axes.set_xlabel('epoch')
    axes.set_ylabel(key)
    axes.legend()
    axes.grid()
    plt.show()
def plot_history(history, key):
    """Plot the single series ``history[key]`` against the epoch number (starting at 1)."""
    figure, axes = plt.subplots()

    epochs = np.arange(1, len(history[key]) + 1)
    axes.plot(epochs, history[key], '-')

    axes.set_xlabel('epoch')
    axes.set_ylabel(key)
    axes.grid()
    plt.show()
# Training vs. validation loss per epoch.
plot_history_train_val(history, 'loss')
# Validation accuracy per epoch.
plot_history(history, 'val acc')