Paper link: https://arxiv.org/pdf/1409.1556.pdf
VGG16 was proposed by the Visual Geometry Group, Department of Engineering Science, University of Oxford, which is where the name VGG comes from; the 16 means that its convolutional and fully connected layers add up to 16 weight layers (13 conv + 3 FC). First, let's look at the structure (listed here for clarity rather than drawn as in the original paper). As you can see, the input image is 224 x 224 with 3 channels:

Layer 1 (conv): 224 x 224 x 64
Layer 2 (conv): 224 x 224 x 64
Max pool: 112 x 112 x 64
Layer 3 (conv): 112 x 112 x 128
Layer 4 (conv): 112 x 112 x 128
Max pool: 56 x 56 x 128
Layer 5 (conv): 56 x 56 x 256
Layer 6 (conv): 56 x 56 x 256
Layer 7 (conv): 56 x 56 x 256
Max pool: 28 x 28 x 256
Layer 8 (conv): 28 x 28 x 512
Layer 9 (conv): 28 x 28 x 512
Layer 10 (conv): 28 x 28 x 512
Max pool: 14 x 14 x 512
Layer 11 (conv): 14 x 14 x 512
Layer 12 (conv): 14 x 14 x 512
Layer 13 (conv): 14 x 14 x 512
Max pool: 7 x 7 x 512
Layer 14 (fully connected): 1 x 1 x 4096
Layer 15 (fully connected): 1 x 1 x 4096
Layer 16 (fully connected): 1 x 1 x 1000
softmax

Every convolution is 3 x 3 with padding 1, so it preserves the spatial size; every pooling is a 2 x 2 max pool with stride 2, which halves it. Pooling changes only the spatial size, never the channel count.
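If you want to verify these shapes yourself, here is a minimal sketch (assuming PyTorch is installed; the dummy input is my own illustration) that pushes a fake 224 x 224 image through the first block:

import torch
import torch.nn as nn

# First VGG16 block: two 3x3 convs (padding=1 keeps H x W), then a 2x2 max pool
block = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(64, 64, kernel_size=3, padding=1), nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2),
)
x = torch.randn(1, 3, 224, 224)   # dummy RGB input image
print(block(x).shape)             # torch.Size([1, 64, 112, 112])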
Features are extracted through repeated convolution and pooling, and the fully connected layers plus a softmax perform the classification. The code below mirrors this structure: 'M' marks a max-pooling layer, and base lists the output channel count of each conv layer.
import torch.nn as nn

def vgg(cfg, i, batch_norm=False):
    layers = []
    in_channels = i
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    # Fifth max pool (224 is halved five times down to 7), then flatten into
    # the three fully connected layers. No ReLU after the last layer, since
    # softmax follows.
    pool5 = nn.MaxPool2d(kernel_size=2, stride=2)
    fc6 = nn.Linear(512 * 7 * 7, 4096)
    fc7 = nn.Linear(4096, 4096)
    fc8 = nn.Linear(4096, 1000)
    layers += [pool5, nn.Flatten(),
               fc6, nn.ReLU(inplace=True),
               fc7, nn.ReLU(inplace=True),
               fc8]
    return layers

base = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512]
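The layer list can be sanity-checked by wrapping it in nn.Sequential and pushing a dummy ImageNet-sized input through it. A minimal sketch (the dummy tensor and the expected output shape are my own illustration, not from the original post):

import torch
import torch.nn as nn

layers = vgg(base, 3)            # 3 input channels (RGB)
net = nn.Sequential(*layers)

x = torch.randn(1, 3, 224, 224)  # dummy 224 x 224 RGB image
print(net(x).shape)              # expected: torch.Size([1, 1000])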
Next, let's implement classification on the CIFAR-10 dataset.

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from tqdm import tqdm

# Hyperparameters
batch_size = 64        # batch size
learning_rate = 1e-3   # learning rate
num_epochs = 100       # number of passes over the training set

# Optional ImageNet-style augmentation (not used below; note that
# transforms.RandomSizedCrop is the deprecated name of RandomResizedCrop):
# transform = transforms.Compose([
#     transforms.RandomResizedCrop(224),
#     transforms.RandomHorizontalFlip(),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406],
#                          std=[0.229, 0.224, 0.225]),
# ])

# Download the CIFAR-10 training and test sets (10 classes)
train_dataset = datasets.CIFAR10('./data', train=True,
                                 transform=transforms.ToTensor(), download=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = datasets.CIFAR10('./data', train=False,
                                transform=transforms.ToTensor(), download=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Network definition
class VGG16(nn.Module):
    def __init__(self, num_classes=10):
        super(VGG16, self).__init__()
        self.features = nn.Sequential(
            # 1
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            # 2
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # 3
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(True),
            # 4
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # 5
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            # 6
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            # 7
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # 8
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            # 9
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            # 10
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # 11
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            # 12
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            # 13
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # For 32 x 32 CIFAR-10 inputs the feature map is now 1 x 1 x 512,
            # so this average pool is a no-op kept for shape clarity.
            nn.AvgPool2d(kernel_size=1, stride=1),
        )
        self.classifier = nn.Sequential(
            # 14
            nn.Linear(512, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            # 15
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            # 16
            nn.Linear(4096, num_classes),
        )
        # A simpler alternative head:
        # self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)  # flatten to (batch, 512)
        out = self.classifier(out)
        return out

# Create the model and move it to the GPU if one is available
model = VGG16()
use_gpu = torch.cuda.is_available()
if use_gpu:
    model = model.cuda()

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    print('*' * 25, 'epoch {}'.format(epoch + 1), '*' * 25)
    model.train()  # back to training mode (model.eval() is called below)
    running_loss = 0.0
    running_acc = 0.0
    for img, label in tqdm(train_loader):
        if use_gpu:
            img = img.cuda()
            label = label.cuda()
        # forward pass
        out = model(img)
        loss = criterion(out, label)
        running_loss += loss.item() * label.size(0)
        _, pred = torch.max(out, 1)  # index of the largest logit = predicted class
        running_acc += (pred == label).sum().item()
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print('Finish {} epoch, Loss: {:.6f}, Acc: {:.6f}'.format(
        epoch + 1, running_loss / len(train_dataset), running_acc / len(train_dataset)))

    # Evaluate on the test set
    model.eval()
    eval_loss = 0
    eval_acc = 0
    with torch.no_grad():  # replaces the deprecated Variable(..., volatile=True)
        for img, label in test_loader:
            if use_gpu:
                img = img.cuda()
                label = label.cuda()
            out = model(img)
            loss = criterion(out, label)
            eval_loss += loss.item() * label.size(0)
            _, pred = torch.max(out, 1)
            eval_acc += (pred == label).sum().item()
    print('Test Loss: {:.6f}, Acc: {:.6f}'.format(
        eval_loss / len(test_dataset), eval_acc / len(test_dataset)))
    print()

# Save the trained weights
torch.save(model.state_dict(), './cnn.pth')

After training for 100 epochs, training accuracy reaches 100% while test accuracy is about 75%.
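To reuse the saved weights later, a minimal reload-and-predict sketch might look like this (it assumes the VGG16 class and test_dataset defined above are in scope; the single-image example is my own illustration):

import torch

# Rebuild the architecture and load the saved weights
model = VGG16()
model.load_state_dict(torch.load('./cnn.pth', map_location='cpu'))
model.eval()

with torch.no_grad():
    img, label = test_dataset[0]        # one CIFAR-10 test image, already a tensor
    logits = model(img.unsqueeze(0))    # add the batch dimension
    pred = logits.argmax(dim=1).item()
print('predicted class: {}, ground truth: {}'.format(pred, label))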
This post is limited by my own understanding and may contain mistakes; questions and comments are welcome.