Created
September 3, 2019 11:58
-
-
Save PrimadonnaGit/ed3c9f0cd2eb1171dca52f862c85de69 to your computer and use it in GitHub Desktop.
SSD detector for pytorch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from torchvision.utils.data import Dataset | |
import json | |
import os | |
from PIL import Image | |
from utils import transform | |
class PascalVOCDataset(Dataset):
    """
    A PyTorch Dataset class to be used in a PyTorch DataLoader to create batches.

    Image paths are read from ``<SPLIT>_images.json`` and the per-image object
    annotations from ``<SPLIT>_objects.json`` inside ``data_folder``.
    """

    def __init__(self, data_folder, split, keep_difficult=False):
        """
        :param data_folder: folder where data files are stored
        :param split: split, one of 'TRAIN' or 'TEST' (case-insensitive)
        :param keep_difficult: keep or discard objects that are considered difficult to detect
        """
        self.split = split.upper()
        assert self.split in {'TRAIN', 'TEST'}

        self.data_folder = data_folder
        self.keep_difficult = keep_difficult

        # Read data files
        with open(os.path.join(data_folder, self.split + '_images.json'), 'r') as j:
            self.images = json.load(j)
        with open(os.path.join(data_folder, self.split + '_objects.json'), 'r') as j:
            self.objects = json.load(j)

        # Every image must have exactly one annotation entry
        assert len(self.images) == len(self.objects)

    def __getitem__(self, i):
        """
        Load the i-th image and its annotations and pass them through `transform`.

        :param i: index into the image / object lists
        :return: whatever `transform` returns for (image, boxes, labels, difficulties)
        """
        # Read image. `convert` must be called on the opened image instance,
        # not on the PIL `Image` module.
        image = Image.open(self.images[i], mode='r')
        image = image.convert('RGB')

        # Read objects in this image (bounding boxes, labels, difficulties)
        objects = self.objects[i]
        boxes = torch.FloatTensor(objects['boxes'])  # (n_objects, 4)
        labels = torch.LongTensor(objects['labels'])  # (n_objects)
        difficulties = torch.ByteTensor(objects['difficulties'])  # (n_objects)

        # Discard difficult objects, if desired. A comparison mask is used
        # instead of `1 - difficulties` so the indexing is an explicit mask
        # selection regardless of the PyTorch version.
        if not self.keep_difficult:
            keep = difficulties == 0
            boxes = boxes[keep]
            labels = labels[keep]
            difficulties = difficulties[keep]

        # Apply transformations
        image, boxes, labels, difficulties = transform(image, boxes, labels, difficulties, split=self.split)

        return image, boxes, labels, difficulties

    def __len__(self):
        """Return the number of images in this split."""
        return len(self.images)

    def collate_fn(self, batch):
        """
        Since each image may have a different number of objects, we need a collate function
        (to be passed to the DataLoader). This describes how to combine these tensors of
        different sizes. We use lists.

        Note: this need not be defined in this class; it can be standalone.

        :param batch: an iterable of N sets from __getitem__()
        :return: a tensor of images, lists of varying-size tensors of bounding boxes, labels, and difficulties
        """
        images = list()
        boxes = list()
        labels = list()
        difficulties = list()

        for b in batch:
            images.append(b[0])
            boxes.append(b[1])
            labels.append(b[2])
            difficulties.append(b[3])

        # Images share one size, so they can be stacked into a single tensor
        images = torch.stack(images, dim=0)

        return images, boxes, labels, difficulties  # stacked image tensor, 3 lists of N tensors each
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch import nn | |
from .utils import * | |
import torch.nn.functional as F | |
from math import sqrt | |
from itertools import product as product | |
import torchvision | |
import torch | |
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
class VGGBase(nn.Module):
    """
    VGG base convolutions, used to produce the lower-level feature maps.
    """

    def __init__(self):
        super().__init__()

        # Standard convolutional layers of VGG16 (Conv2d stride is 1 by default).
        # NOTE: the registration order matters - load_pretrained_layers() maps
        # pretrained weights onto these layers by position.
        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        # Round the output size up rather than down, so 75 -> 38 instead of 37
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)

        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        # Stride 1 with padding 1: this pooling keeps the spatial size
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)

        # Convolutional stand-ins for FC6 and FC7 of VGG16
        self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)  # atrous convolution
        self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1)

        # Fill the layers above with pretrained weights
        self.load_pretrained_layers()

    def forward(self, image):
        """
        Forward propagation.

        :param image: images, a tensor of dimensions (N, 3, 300, 300)
        :return: lower-level feature maps conv4_3 and conv7
        """
        x = image

        for conv in (self.conv1_1, self.conv1_2):
            x = F.relu(conv(x))  # (N, 64, 300, 300)
        x = self.pool1(x)  # (N, 64, 150, 150)

        for conv in (self.conv2_1, self.conv2_2):
            x = F.relu(conv(x))  # (N, 128, 150, 150)
        x = self.pool2(x)  # (N, 128, 75, 75)

        for conv in (self.conv3_1, self.conv3_2, self.conv3_3):
            x = F.relu(conv(x))  # (N, 256, 75, 75)
        x = self.pool3(x)  # (N, 256, 38, 38) thanks to ceil_mode=True

        for conv in (self.conv4_1, self.conv4_2, self.conv4_3):
            x = F.relu(conv(x))  # (N, 512, 38, 38)
        conv4_3_feats = x  # (N, 512, 38, 38)
        x = self.pool4(x)  # (N, 512, 19, 19)

        for conv in (self.conv5_1, self.conv5_2, self.conv5_3):
            x = F.relu(conv(x))  # (N, 512, 19, 19)
        x = self.pool5(x)  # (N, 512, 19, 19), size unchanged

        x = F.relu(self.conv6(x))  # (N, 1024, 19, 19)
        conv7_feats = F.relu(self.conv7(x))  # (N, 1024, 19, 19)

        # Lower-level feature maps
        return conv4_3_feats, conv7_feats

    def load_pretrained_layers(self):
        """
        As in the paper, we use a VGG-16 pretrained on the ImageNet task as the base network.
        Its parameters are copied into this network, which is straightforward for conv1 to conv5.

        The original VGG-16 has no conv6 and conv7 layers, so fc6 and fc7 are reshaped into
        convolutional kernels and subsampled (by decimation) to the sizes of conv6 and conv7.
        """
        # Parameters of this network, in registration order
        own_state = self.state_dict()
        own_names = list(own_state.keys())

        # Parameters of the pretrained VGG-16
        vgg_state = torchvision.models.vgg16(pretrained=True).state_dict()
        vgg_names = list(vgg_state.keys())

        # Copy conv1..conv5 positionally; the last 4 entries (conv6/conv7 weight and bias) are handled below
        for own_name, vgg_name in zip(own_names[:-4], vgg_names):
            own_state[own_name] = vgg_state[vgg_name]

        # Note: an FC layer of size (K) operating on a flattened version (C*H*W) of a 2D image of size (C, H, W)
        # is equivalent to a convolutional layer with kernel size (H, W), input channels C, output channels K
        # operating on the 2D image of size (C, H, W) without padding

        # fc6 -> conv6
        fc6_weight = vgg_state['classifier.0.weight'].view(4096, 512, 7, 7)  # (4096, 512, 7, 7)
        fc6_bias = vgg_state['classifier.0.bias']  # (4096)
        own_state['conv6.weight'] = decimate(fc6_weight, m=[4, None, 3, 3])  # (1024, 512, 3, 3)
        own_state['conv6.bias'] = decimate(fc6_bias, m=[4])  # (1024)

        # fc7 -> conv7
        fc7_weight = vgg_state['classifier.3.weight'].view(4096, 4096, 1, 1)  # (4096, 4096, 1, 1)
        fc7_bias = vgg_state['classifier.3.bias']  # (4096)
        own_state['conv7.weight'] = decimate(fc7_weight, m=[4, 4, None, None])  # (1024, 1024, 1, 1)
        own_state['conv7.bias'] = decimate(fc7_bias, m=[4])  # (1024)

        self.load_state_dict(own_state)

        print("\nLoaded base model.\n")
class AuxiliaryConvolutions(nn.Module):
    """
    Extra convolutions stacked on the VGG base to produce the higher-level feature maps.
    """

    def __init__(self):
        super().__init__()

        # Each stage is a 1x1 channel reduction followed by a 3x3 convolution that shrinks the map
        self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1, padding=0)  # stride is 1 by default
        self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)  # shrinks: stride 2

        self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1, padding=0)
        self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)  # shrinks: stride 2

        self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)  # shrinks: no padding

        self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)  # shrinks: no padding

        # Set initial weights and biases
        self.init_conv2d()

    def init_conv2d(self):
        """
        Initialize convolution parameters: Xavier-uniform weights and zero biases.
        """
        for layer in self.children():
            if not isinstance(layer, nn.Conv2d):
                continue
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0.)

    def forward(self, conv7_feats):
        """
        Forward propagation.

        :param conv7_feats: lower-level conv7 feature map, a tensor of dimensions (N, 1024, 19, 19)
        :return: higher-level feature maps conv8_2, conv9_2, conv10_2, and conv11_2
        """
        reduced = F.relu(self.conv8_1(conv7_feats))  # (N, 256, 19, 19)
        conv8_2_feats = F.relu(self.conv8_2(reduced))  # (N, 512, 10, 10)

        reduced = F.relu(self.conv9_1(conv8_2_feats))  # (N, 128, 10, 10)
        conv9_2_feats = F.relu(self.conv9_2(reduced))  # (N, 256, 5, 5)

        reduced = F.relu(self.conv10_1(conv9_2_feats))  # (N, 128, 5, 5)
        conv10_2_feats = F.relu(self.conv10_2(reduced))  # (N, 256, 3, 3)

        reduced = F.relu(self.conv11_1(conv10_2_feats))  # (N, 128, 3, 3)
        conv11_2_feats = F.relu(self.conv11_2(reduced))  # (N, 256, 1, 1)

        # Higher-level feature maps
        return conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats
class PredictionConvolutions(nn.Module):
    """
    Convolutions to predict class scores and bounding boxes using lower and higher-level feature maps.

    The bounding boxes (locations) are predicted as encoded offsets w.r.t each of the 8732 prior (default) boxes.
    The class scores represent the scores of each object class in each of the 8732 bounding boxes located.
    A high score for 'background' = no object.
    """

    def __init__(self, n_classes):
        """
        :param n_classes: number of different types of objects
        """
        super(PredictionConvolutions, self).__init__()

        self.n_classes = n_classes

        # Number of prior-boxes we are considering per position in each feature map
        n_boxes = {'conv4_3': 4,
                   'conv7': 6,
                   'conv8_2': 6,
                   'conv9_2': 6,
                   'conv10_2': 4,
                   'conv11_2': 4}
        # 4 prior-boxes implies we use 4 different aspect ratios, etc.

        # Localization prediction convolutions (predict offsets(4) w.r.t prior-boxes)
        self.loc_conv4_3 = nn.Conv2d(512, n_boxes['conv4_3'] * 4, kernel_size=3, padding=1)
        self.loc_conv7 = nn.Conv2d(1024, n_boxes['conv7'] * 4, kernel_size=3, padding=1)
        self.loc_conv8_2 = nn.Conv2d(512, n_boxes['conv8_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv9_2 = nn.Conv2d(256, n_boxes['conv9_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv10_2 = nn.Conv2d(256, n_boxes['conv10_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv11_2 = nn.Conv2d(256, n_boxes['conv11_2'] * 4, kernel_size=3, padding=1)

        # Class prediction convolutions (predict classes in localization boxes)
        self.cl_conv4_3 = nn.Conv2d(512, n_boxes['conv4_3'] * n_classes, kernel_size=3, padding=1)
        self.cl_conv7 = nn.Conv2d(1024, n_boxes['conv7'] * n_classes, kernel_size=3, padding=1)
        self.cl_conv8_2 = nn.Conv2d(512, n_boxes['conv8_2'] * n_classes, kernel_size=3, padding=1)
        self.cl_conv9_2 = nn.Conv2d(256, n_boxes['conv9_2'] * n_classes, kernel_size=3, padding=1)
        self.cl_conv10_2 = nn.Conv2d(256, n_boxes['conv10_2'] * n_classes, kernel_size=3, padding=1)
        self.cl_conv11_2 = nn.Conv2d(256, n_boxes['conv11_2'] * n_classes, kernel_size=3, padding=1)

        # Initialize convolutions' parameters
        self.init_conv2d()

    def init_conv2d(self):
        """
        Initialize convolution parameters: Xavier-uniform weights and zero biases.
        """
        for c in self.children():
            if isinstance(c, nn.Conv2d):
                nn.init.xavier_uniform_(c.weight)
                nn.init.constant_(c.bias, 0.)

    @staticmethod
    def _predict(conv, feats, batch_size, last_dim):
        """
        Apply `conv` to one feature map and flatten the result to (N, n_boxes_on_map, last_dim).

        The channel axis is moved last before flattening so that, after .view(), the
        predictions are ordered position by position, matching the prior-box order.
        """
        out = conv(feats)  # (N, n_boxes * last_dim, H, W)
        # .contiguous() stores the permuted tensor in one chunk of memory, which .view() requires
        out = out.permute(0, 2, 3, 1).contiguous()  # (N, H, W, n_boxes * last_dim)
        return out.view(batch_size, -1, last_dim)  # (N, H * W * n_boxes, last_dim)

    def forward(self, conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats):
        """
        Forward propagation.

        :param conv4_3_feats: conv4_3 feature map, a tensor of dimensions (N, 512, 38, 38)
        :param conv7_feats: conv7 feature map, a tensor of dimensions (N, 1024, 19, 19)
        :param conv8_2_feats: conv8_2 feature map, a tensor of dimensions (N, 512, 10, 10)
        :param conv9_2_feats: conv9_2 feature map, a tensor of dimensions (N, 256, 5, 5)
        :param conv10_2_feats: conv10_2 feature map, a tensor of dimensions (N, 256, 3, 3)
        :param conv11_2_feats: conv11_2 feature map, a tensor of dimensions (N, 256, 1, 1)
        :return: 8732 locations and class scores (i.e. w.r.t each prior box) for each image
        """
        batch_size = conv4_3_feats.size(0)

        # Feature maps and their prediction layers, all listed in the order of the prior-boxes.
        # Box counts per map: 5776, 2166, 600, 150, 36, 4 (total 8732).
        feature_maps = (conv4_3_feats, conv7_feats, conv8_2_feats,
                        conv9_2_feats, conv10_2_feats, conv11_2_feats)
        loc_convs = (self.loc_conv4_3, self.loc_conv7, self.loc_conv8_2,
                     self.loc_conv9_2, self.loc_conv10_2, self.loc_conv11_2)
        cl_convs = (self.cl_conv4_3, self.cl_conv7, self.cl_conv8_2,
                    self.cl_conv9_2, self.cl_conv10_2, self.cl_conv11_2)

        # Predict localization boxes' bounds (as offsets w.r.t prior-boxes)
        locs = torch.cat(
            [self._predict(conv, feats, batch_size, 4) for conv, feats in zip(loc_convs, feature_maps)],
            dim=1)  # (N, 8732, 4)

        # Predict classes in localization boxes. Each feature map contributes exactly once,
        # so the scores stay aligned row-for-row with `locs`.
        classes_scores = torch.cat(
            [self._predict(conv, feats, batch_size, self.n_classes) for conv, feats in zip(cl_convs, feature_maps)],
            dim=1)  # (N, 8732, n_classes)

        return locs, classes_scores
class SSD300(nn.Module):
    """
    The SSD300 network - encapsulates the base VGG network, auxiliary, and prediction convolutions.
    """

    def __init__(self, n_classes):
        """
        :param n_classes: number of classes predicted per box; class index 0 is skipped
            during detection (presumably the 'background' class - confirm against the label map)
        """
        super(SSD300, self).__init__()

        self.n_classes = n_classes

        self.base = VGGBase()
        self.aux_convs = AuxiliaryConvolutions()
        self.pred_convs = PredictionConvolutions(n_classes)

        # Since lower level features (conv4_3_feats) have considerably larger scales, we take the L2 norm and rescale
        # Rescale factor is initially set at 20, but is learned for each channel during back-prop
        self.rescale_factors = nn.Parameter(torch.FloatTensor(1, 512, 1, 1))  # there are 512 channels in conv4_3_feats
        nn.init.constant_(self.rescale_factors, 20)

        # Prior boxes
        self.priors_cxcy = self.create_prior_boxes()

    def forward(self, image):
        """
        Forward propagation.

        :param image: images, a tensor of dimensions (N, 3, 300, 300)
        :return: 8732 locations and class scores (i.e. w.r.t each prior box) for each image
        """
        # Run VGG base network convolutions (lower level feature map generators)
        conv4_3_feats, conv7_feats = self.base(image)  # (N, 512, 38, 38), (N, 1024, 19, 19)

        # Rescale conv4_3 after L2 norm
        norm = conv4_3_feats.pow(2).sum(dim=1, keepdim=True).sqrt()  # (N, 1, 38, 38)
        conv4_3_feats = conv4_3_feats / norm  # (N, 512, 38, 38)
        conv4_3_feats = conv4_3_feats * self.rescale_factors  # (N, 512, 38, 38)
        # (PyTorch autobroadcasts singleton dimensions during arithmetic)

        # Run auxiliary convolutions (higher level feature map generators)
        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats = self.aux_convs(conv7_feats)
        # (N, 512, 10, 10), (N, 256, 5, 5), (N, 256, 3, 3), (N, 256, 1, 1)

        # Run prediction convolutions (predict offsets w.r.t prior-boxes and classes in each resulting localization box)
        locs, classes_scores = self.pred_convs(conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats,
                                               conv10_2_feats, conv11_2_feats)
        # (N, 8732, 4), (N, 8732, n_classes)

        return locs, classes_scores

    def create_prior_boxes(self):
        """
        Create the 8732 prior (default) boxes for the SSD300.

        :return: prior boxes in center-size coordinates, a tensor of dimensions (8732, 4)
        """
        fmap_dims = {'conv4_3': 38,
                     'conv7': 19,
                     'conv8_2': 10,
                     'conv9_2': 5,
                     'conv10_2': 3,
                     'conv11_2': 1}

        obj_scales = {'conv4_3': 0.1,
                      'conv7': 0.2,
                      'conv8_2': 0.375,
                      'conv9_2': 0.55,
                      'conv10_2': 0.725,
                      'conv11_2': 0.9}

        aspect_ratios = {'conv4_3': [1., 2., 0.5],
                         'conv7': [1., 2., 3., 0.5, .333],
                         'conv8_2': [1., 2., 3., 0.5, .333],
                         'conv9_2': [1., 2., 3., 0.5, .333],
                         'conv10_2': [1., 2., 0.5],
                         'conv11_2': [1., 2., 0.5]}

        fmaps = list(fmap_dims.keys())

        prior_boxes = []

        for k, fmap in enumerate(fmaps):
            scale = obj_scales[fmap]

            # For an aspect ratio of 1, an additional prior is used whose scale is the geometric mean
            # of the scale of the current feature map and the scale of the next feature map.
            # For the last feature map, there is no "next" feature map, so 1. is used.
            if k + 1 < len(fmaps):
                additional_scale = sqrt(scale * obj_scales[fmaps[k + 1]])
            else:
                additional_scale = 1.

            for i in range(fmap_dims[fmap]):
                for j in range(fmap_dims[fmap]):
                    # Center of cell (i, j) in fractional coordinates
                    cx = (j + 0.5) / fmap_dims[fmap]
                    cy = (i + 0.5) / fmap_dims[fmap]

                    for ratio in aspect_ratios[fmap]:
                        prior_boxes.append([cx, cy, scale * sqrt(ratio), scale / sqrt(ratio)])  # (cx, cy, w, h)

                        if ratio == 1.:
                            prior_boxes.append([cx, cy, additional_scale, additional_scale])

        prior_boxes = torch.FloatTensor(prior_boxes).to(device)  # (8732, 4)
        prior_boxes.clamp_(0, 1)  # (8732, 4)

        return prior_boxes

    def detect_objects(self, predicted_locs, predicted_scores, min_score, max_overlap, top_k):
        """
        Decipher the 8732 locations and class scores (output of this SSD300) to detect objects.

        For each class, perform Non-Maximum Suppression (NMS) on boxes that are above a minimum threshold.

        :param predicted_locs: predicted locations/boxes w.r.t the 8732 prior boxes, a tensor of dimensions (N, 8732, 4)
        :param predicted_scores: class scores for each of the encoded locations/boxes, a tensor of dimensions (N, 8732, n_classes)
        :param min_score: minimum threshold for a box to be considered a match for a certain class
        :param max_overlap: maximum overlap two boxes can have so that the one with the lower score is not suppressed via NMS
        :param top_k: if there are a lot of resulting detections across all classes, keep only the top 'k'
        :return: detections (boxes, labels, and scores), lists of length batch_size
        """
        batch_size = predicted_locs.size(0)
        n_priors = self.priors_cxcy.size(0)
        predicted_scores = F.softmax(predicted_scores, dim=2)  # (N, 8732, n_classes)

        # Lists to store final predicted boxes, labels, and scores for all images
        all_images_boxes = list()
        all_images_labels = list()
        all_images_scores = list()

        assert n_priors == predicted_locs.size(1) == predicted_scores.size(1)

        # New tensors are created on the same device as the predictions
        out_device = predicted_scores.device

        for i in range(batch_size):
            # Decode object coordinates from the form we regressed predicted boxes to
            decoded_locs = cxcy_to_xy(
                gcxgcy_to_cxcy(predicted_locs[i], self.priors_cxcy))  # (8732, 4), these are fractional pt. coordinates

            # Lists to store boxes, labels, and scores for this image
            image_boxes = list()
            image_labels = list()
            image_scores = list()

            # Check for each class; class 0 is skipped
            for c in range(1, self.n_classes):
                # Keep only predicted boxes and scores where scores for this class are above the minimum score
                class_scores = predicted_scores[i][:, c]  # (8732)
                score_above_min_score = class_scores > min_score  # mask, for indexing
                n_above_min_score = score_above_min_score.sum().item()
                if n_above_min_score == 0:
                    continue
                class_scores = class_scores[score_above_min_score]  # (n_qualified), n_qualified <= 8732
                class_decoded_locs = decoded_locs[score_above_min_score]  # (n_qualified, 4)

                # Sort predicted boxes and scores by scores
                class_scores, sort_ind = class_scores.sort(dim=0, descending=True)  # (n_qualified), (n_qualified)
                class_decoded_locs = class_decoded_locs[sort_ind]  # (n_qualified, 4)

                # Find the overlap between predicted boxes
                overlap = find_jaccard_overlap(class_decoded_locs, class_decoded_locs)  # (n_qualified, n_qualified)

                # Non-Maximum Suppression (NMS)

                # A boolean mask to keep track of which predicted boxes to suppress
                # True implies suppress, False implies don't suppress
                suppress = torch.zeros(n_above_min_score, dtype=torch.bool, device=out_device)  # (n_qualified)

                # Consider each box in order of decreasing scores
                for box in range(class_decoded_locs.size(0)):
                    # If this box is already marked for suppression
                    if suppress[box]:
                        continue

                    # Suppress boxes whose overlaps (with this box) are greater than maximum overlap.
                    # The OR retains previously suppressed boxes.
                    suppress = suppress | (overlap[box] > max_overlap)

                    # Don't suppress this box, even though it has an overlap of 1 with itself
                    suppress[box] = False

                # Store only unsuppressed boxes for this class
                keep = ~suppress
                n_kept = int(keep.sum().item())
                image_boxes.append(class_decoded_locs[keep])
                image_labels.append(torch.full((n_kept,), c, dtype=torch.long, device=out_device))
                image_scores.append(class_scores[keep])

            # If no object in any class is found, store a placeholder for 'background'.
            # The box is 2-D, (1, 4), so the result keeps the documented (n_objects, 4) shape.
            if len(image_boxes) == 0:
                image_boxes.append(torch.FloatTensor([[0., 0., 1., 1.]]).to(out_device))
                image_labels.append(torch.LongTensor([0]).to(out_device))
                image_scores.append(torch.FloatTensor([0.]).to(out_device))

            # Concatenate into single tensors
            image_boxes = torch.cat(image_boxes, dim=0)  # (n_objects, 4)
            image_labels = torch.cat(image_labels, dim=0)  # (n_objects)
            image_scores = torch.cat(image_scores, dim=0)  # (n_objects)
            n_objects = image_scores.size(0)

            # Keep only the top k objects
            if n_objects > top_k:
                image_scores, sort_ind = image_scores.sort(dim=0, descending=True)
                image_scores = image_scores[:top_k]  # (top_k)
                image_boxes = image_boxes[sort_ind][:top_k]  # (top_k, 4)
                image_labels = image_labels[sort_ind][:top_k]  # (top_k)

            # Append to lists that store predicted boxes and scores for all images
            all_images_boxes.append(image_boxes)
            all_images_labels.append(image_labels)
            all_images_scores.append(image_scores)

        return all_images_boxes, all_images_labels, all_images_scores  # lists of length batch_size
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
import torch | |
import random | |
import xml.etree.ElementTree as ET | |
import torchvision.transforms.functional as FT | |
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Label map: class name -> integer id starting at 1; id 0 is used for 'background'
voc_labels = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable',
              'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor')
print('Length of labels : {}'.format(len(voc_labels)))
label_map = {name: index for index, name in enumerate(voc_labels, start=1)}
label_map['background'] = 0
rev_label_map = {index: name for name, index in label_map.items()}  # Inverse mapping: id -> class name
print(rev_label_map)

# Color map for bounding boxes of detected objects, one color per entry of label_map (in insertion order)
# from https://sashat.me/2017/01/11/list-of-20-simple-distinct-colors/
distinct_colors = ['#e6194b', '#3cb44b', '#ffe119', '#0082c8', '#f58231', '#911eb4', '#46f0f0', '#f032e6',
                   '#d2f53c', '#fabebe', '#008080', '#000080', '#aa6e28', '#fffac8', '#800000', '#aaffc3', '#808000',
                   '#ffd8b1', '#e6beff', '#808080', '#FFFFFF']
label_color_map = dict(zip(label_map, distinct_colors))
print(label_color_map)
## | |
def parse_annotation(annotation_path):
    """
    Parse one XML annotation file into lists of boxes, labels, and difficulty flags.

    Objects whose (lower-cased, stripped) name is not a key of `label_map` are skipped.

    :param annotation_path: path to the XML annotation file
    :return: dict with keys 'boxes' (list of [xmin, ymin, xmax, ymax]), 'labels' (list of
        label ids from `label_map`), and 'difficulties' (list of 0/1 flags)
    """
    root = ET.parse(annotation_path).getroot()

    boxes, labels, difficulties = [], [], []

    for obj in root.iter('object'):
        # 1 when the <difficult> element's text is exactly '1', else 0
        is_difficult = int(obj.find('difficult').text == '1')

        name = obj.find('name').text.lower().strip()
        if name not in label_map:
            continue

        bndbox = obj.find('bndbox')
        # Each coordinate is shifted down by 1 (presumably 1-based -> 0-based pixel indices)
        corners = [int(bndbox.find(tag).text) - 1 for tag in ('xmin', 'ymin', 'xmax', 'ymax')]

        boxes.append(corners)
        labels.append(label_map[name])
        difficulties.append(is_difficult)

    return {'boxes': boxes,
            'labels': labels,
            'difficulties': difficulties}
def _collect_annotated_images(voc_path, image_set):
    """Return (image_paths, objects, n_objects) for the ids listed in ImageSets/Main/<image_set>.txt."""
    with open(os.path.join(voc_path, 'ImageSets', 'Main', image_set + '.txt')) as f:
        image_ids = f.read().splitlines()

    images = list()
    all_objects = list()
    n_objects = 0

    for image_id in image_ids:
        # Parse the annotation's XML file
        objects = parse_annotation(os.path.join(voc_path, 'Annotations', image_id + '.xml'))

        # `objects` is a dict that always has three keys, so emptiness must be
        # checked on the list of boxes, not on the dict itself
        if len(objects['boxes']) == 0:
            continue

        n_objects += len(objects['boxes'])
        all_objects.append(objects)
        images.append(os.path.join(voc_path, 'JPEGImages', image_id + '.jpg'))

    return images, all_objects, n_objects


def create_data_lists(voc07_path, voc12_path, output_folder):
    """
    Create lists of images, the bounding boxes and labels of the objects in these images, and save these to file.

    Training data is the 'trainval' set of both folders; validation data is the 'test' set of VOC2007.
    Images without any annotated object are left out.

    :param voc07_path: path to the 'VOC2007' folder
    :param voc12_path: path to the 'VOC2012' folder
    :param output_folder: folder where the JSONs must be saved
    """
    voc07_path = os.path.abspath(voc07_path)
    voc12_path = os.path.abspath(voc12_path)

    # Training data
    train_images = list()
    train_objects = list()
    n_objects = 0

    for path in [voc07_path, voc12_path]:
        images, objects, count = _collect_annotated_images(path, 'trainval')
        train_images.extend(images)
        train_objects.extend(objects)
        n_objects += count

    assert len(train_objects) == len(train_images)

    # Save to file
    with open(os.path.join(output_folder, 'TRAIN_images.json'), 'w') as j:
        json.dump(train_images, j)
    with open(os.path.join(output_folder, 'TRAIN_objects.json'), 'w') as j:
        json.dump(train_objects, j)
    with open(os.path.join(output_folder, 'label_map.json'), 'w') as j:
        json.dump(label_map, j)

    print('\nThere are %d training images containing a total of %d objects. Files have been saved to %s.' % (
        len(train_images), n_objects, os.path.abspath(output_folder)))

    # Validation data
    test_images, test_objects, n_objects = _collect_annotated_images(voc07_path, 'test')

    assert len(test_images) == len(test_objects)

    # Save to file
    with open(os.path.join(output_folder, 'TEST_images.json'), 'w') as j:
        json.dump(test_images, j)
    with open(os.path.join(output_folder, 'TEST_objects.json'), 'w') as j:
        json.dump(test_objects, j)

    print('\nThere are %d validation images containing a total of %d objects. Files have been saved to %s.' % (
        len(test_images), n_objects, os.path.abspath(output_folder)))
def decimate(tensor, m):
    """
    Downsample a tensor by keeping every 'm'th value along each dimension.

    This is used when we convert FC layers to equivalent convolutional layers, BUT of a smaller size.

    :param tensor: tensor to be decimated
    :param m: list of decimation factors, one per dimension of the tensor; None leaves that dimension untouched
    :return: decimated tensor
    """
    assert tensor.dim() == len(m)

    for axis in range(tensor.dim()):
        step = m[axis]
        if step is None:
            continue
        # Indices 0, step, 2*step, ... along this axis
        kept = torch.arange(start=0, end=tensor.size(axis), step=step).long()
        tensor = tensor.index_select(dim=axis, index=kept)

    return tensor
def photometric_distort(image):
    """
    Distort brightness, contrast, saturation, and hue, each with a 50% chance, in random order.

    :param image: image, a PIL Image
    :return: distorted image
    """
    new_image = image

    distortions = [FT.adjust_brightness,
                   FT.adjust_contrast,
                   FT.adjust_saturation,
                   FT.adjust_hue]
    random.shuffle(distortions)

    for d in distortions:
        if random.random() < 0.5:
            # Compare the function object itself; identity comparison against a string literal
            # only works by accident of interning
            if d is FT.adjust_hue:
                # Caffe repo uses a 'hue_delta' of 18 - we divide by 255 because PyTorch needs a normalized value
                adjust_factor = random.uniform(-18 / 255., 18 / 255.)
            else:
                # Caffe repo uses 'lower' and 'upper' values of 0.5 and 1.5 for brightness, contrast, and saturation
                adjust_factor = random.uniform(0.5, 1.5)

            # Apply this distortion
            new_image = d(new_image, adjust_factor)

    return new_image
def cxcy_to_xy(cxcy):
    """
    Turn center-size boxes (c_x, c_y, w, h) into boundary boxes (x_min, y_min, x_max, y_max).

    :param cxcy: bounding boxes in center-size coordinates, a tensor of size (n_boxes, 4)
    :return: bounding boxes in boundary coordinates, a tensor of size (n_boxes, 4)
    """
    centers = cxcy[:, :2]
    half_sizes = cxcy[:, 2:] / 2
    top_left = centers - half_sizes  # x_min, y_min
    bottom_right = centers + half_sizes  # x_max, y_max
    return torch.cat([top_left, bottom_right], 1)
def gcxgcy_to_cxcy(gcxgcy, priors_cxcy):
    """
    Decode the box offsets predicted by the model back into center-size coordinates.

    Center offsets are scaled by the prior's size (divided by 10) and added to the prior's center;
    size offsets are divided by 5, exponentiated and multiplied by the prior's size.

    :param gcxgcy: encoded bounding boxes, i.e. output of the model, a tensor of size (n_priors, 4)
    :param priors_cxcy: prior boxes with respect to which the encoding is defined, a tensor of size (n_priors, 4)
    :return: decoded bounding boxes in center-size form, a tensor of size (n_priors, 4)
    """
    prior_centers = priors_cxcy[:, :2]
    prior_sizes = priors_cxcy[:, 2:]

    decoded_centers = gcxgcy[:, :2] * prior_sizes / 10 + prior_centers  # c_x, c_y
    decoded_sizes = torch.exp(gcxgcy[:, 2:] / 5) * prior_sizes  # w, h

    return torch.cat([decoded_centers, decoded_sizes], 1)
def expand(image, boxes, filler, max_scale=4):
    """
    Perform a zooming out operation by placing the image in a larger canvas of filler material.

    :param image: image, a tensor of dimensions (3, original_h, original_w)
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param filler: RGB values of the filler material, a list like [R, G, B]
    :param max_scale: upper bound of the random zoom-out factor, which is drawn uniformly from [1, max_scale]
    :return: expanded image, updated bounding box coordinates
    """
    # Calculate dimensions of proposed expanded (zoomed-out) image
    original_h = image.size(1)
    original_w = image.size(2)
    scale = random.uniform(1, max_scale)
    new_h = int(scale * original_h)
    new_w = int(scale * original_w)

    # Create such an image with the filler
    filler = torch.FloatTensor(filler)  # (3)
    new_image = torch.ones((3, new_h, new_w), dtype=torch.float) * filler.unsqueeze(1).unsqueeze(1)  # (3, new_h, new_w)
    # Note - do not use expand() like new_image = filler.unsqueeze(1).unsqueeze(1).expand(3, new_h, new_w)
    # because all expanded values will share the same memory, so changing one pixel will change all

    # Place the original image at random coordinates in this new image (origin at top-left of image)
    left = random.randint(0, new_w - original_w)
    right = left + original_w
    top = random.randint(0, new_h - original_h)
    bottom = top + original_h
    # 'left:right' is a start:stop slice; 'left::right' would treat 'right' as a step
    new_image[:, top:bottom, left:right] = image

    # Adjust bounding boxes' coordinates accordingly
    new_boxes = boxes + torch.FloatTensor([left, top, left, top]).unsqueeze(0)  # (n_objects, 4)

    return new_image, new_boxes
def find_intersection(set_1, set_2):
    """
    Compute the intersection area for every pairing of a box from set 1 with a box from set 2.

    Both sets must be in boundary coordinates (x_min, y_min, x_max, y_max).

    :param set_1: set 1, a tensor of dimensions (n1, 4)
    :param set_2: set 2, a tensor of dimensions (n2, 4)
    :return: intersection of each of the boxes in set 1 with respect to each of the boxes in set 2, a tensor of dimensions (n1, n2)
    """
    # Insert singleton dimensions so the two sets broadcast against each other
    mins_1 = set_1[:, :2].unsqueeze(1)  # (n1, 1, 2)
    mins_2 = set_2[:, :2].unsqueeze(0)  # (1, n2, 2)
    maxs_1 = set_1[:, 2:].unsqueeze(1)  # (n1, 1, 2)
    maxs_2 = set_2[:, 2:].unsqueeze(0)  # (1, n2, 2)

    inter_min = torch.max(mins_1, mins_2)  # (n1, n2, 2): inter_min_x, inter_min_y
    inter_max = torch.min(maxs_1, maxs_2)  # (n1, n2, 2): inter_max_x, inter_max_y

    # Boxes that do not overlap yield a negative extent, which is clamped to zero
    extents = torch.clamp(inter_max - inter_min, min=0)  # (n1, n2, 2)
    widths = extents[:, :, 0]
    heights = extents[:, :, 1]
    return widths * heights  # (n1, n2)
def find_jaccard_overlap(set_1, set_2):
    """
    Find the Jaccard Overlap (IoU) of every box combination between two sets of boxes that are in boundary coordinates.

    :param set_1: set 1, a tensor of dimensions (n1, 4)
    :param set_2: set 2, a tensor of dimensions (n2, 4)
    :return: Jaccard Overlap of each of the boxes in set 1 with respect to each of the boxes in set 2, a tensor of dimensions (n1, n2)
    """
    # Find intersections
    intersection = find_intersection(set_1, set_2)  # (n1, n2)

    # Find areas of each box in both sets: (x_max - x_min) * (y_max - y_min)
    areas_set_1 = (set_1[:, 2] - set_1[:, 0]) * (set_1[:, 3] - set_1[:, 1])  # (n1)
    areas_set_2 = (set_2[:, 2] - set_2[:, 0]) * (set_2[:, 3] - set_2[:, 1])  # (n2)

    # Find the union
    # PyTorch auto-broadcasts singleton dimensions
    union = areas_set_1.unsqueeze(1) + areas_set_2.unsqueeze(0) - intersection  # (n1, n2)

    return intersection / union  # (n1, n2)
def random_crop(image, boxes, labels, difficulties):
    """
    Performs a random crop in the manner stated in the paper. Helps to learn to detect larger and partial objects.

    Note that some objects may be cut out entirely.

    :param image: image, a tensor of dimensions (3, original_h, original_w)
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param labels: labels of objects, a tensor of dimensions (n_objects)
    :param difficulties: difficulties of detection of these objects, a tensor of dimensions (n_objects)
    :return: cropped image, updated bounding box coordinates, updated labels, updated difficulties
    """
    original_h = image.size(1)
    original_w = image.size(2)

    # Try up to 50 times for each choice of minimum overlap
    # This isn't mentioned in the paper, but 50 is chosen in paper author's Caffe repo
    max_trials = 50

    # Crop dimensions must be in [0.3, 1] of original dimensions
    # Note - it's [0.1, 1] in the paper, but actually [0.3, 1] in the author's repo
    min_scale = 0.3

    # Keep choosing a minimum overlap until a successful crop is made
    while True:
        # Randomly draw the value for minimum overlap
        min_overlap = random.choice([0., .1, .3, .5, .7, .9, None])  # 'None' refers to no cropping

        # If not cropping
        if min_overlap is None:
            return image, boxes, labels, difficulties

        for _ in range(max_trials):
            scale_h = random.uniform(min_scale, 1)
            scale_w = random.uniform(min_scale, 1)
            new_h = int(scale_h * original_h)
            new_w = int(scale_w * original_w)

            # Aspect ratio has to be in [0.5, 2]
            aspect_ratio = new_h / new_w
            if not 0.5 < aspect_ratio < 2:
                continue

            # Crop coordinates (origin at top-left of image)
            left = random.randint(0, original_w - new_w)
            right = left + new_w
            top = random.randint(0, original_h - new_h)
            bottom = top + new_h
            crop = torch.FloatTensor([left, top, right, bottom])  # (4)

            # Calculate Jaccard overlap between the crop and the bounding boxes
            overlap = find_jaccard_overlap(crop.unsqueeze(0), boxes)  # (1, n_objects)
            overlap = overlap.squeeze(0)  # (n_objects)

            # If not a single bounding box has a Jaccard overlap of greater than the minimum, try again
            if overlap.max().item() < min_overlap:
                continue

            # Crop image
            new_image = image[:, top:bottom, left:right]  # (3, new_h, new_w)

            # Find centers of original bounding boxes
            bb_centers = (boxes[:, :2] + boxes[:, 2:]) / 2.  # (n_objects, 2)

            # Find bounding boxes whose centers are in the crop
            centers_in_crop = ((bb_centers[:, 0] > left) & (bb_centers[:, 0] < right)
                               & (bb_centers[:, 1] > top) & (bb_centers[:, 1] < bottom))  # (n_objects)

            # If not a single bounding box has its center in the crop, try again
            if not centers_in_crop.any():
                continue

            # Discard bounding boxes that don't meet this criterion; keep all four coordinates of the rest.
            # Mask indexing returns a copy, so the in-place edits below do not touch the caller's 'boxes'.
            new_boxes = boxes[centers_in_crop, :]
            new_labels = labels[centers_in_crop]
            new_difficulties = difficulties[centers_in_crop]

            # Calculate bounding boxes' new coordinates in the crop: clip to the crop, then shift the origin
            new_boxes[:, :2] = torch.max(new_boxes[:, :2], crop[:2])  # crop[:2] is [left, top]
            new_boxes[:, :2] -= crop[:2]
            new_boxes[:, 2:] = torch.min(new_boxes[:, 2:], crop[2:])  # crop[2:] is [right, bottom]
            new_boxes[:, 2:] -= crop[:2]

            return new_image, new_boxes, new_labels, new_difficulties
def flip(image, boxes):
    """
    Flip image horizontally.

    The input 'boxes' tensor is left untouched; a new tensor is returned.

    :param image: image, a PIL Image
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :return: flipped image, updated bounding box coordinates
    """
    # Flip image
    new_image = FT.hflip(image)

    # Flip boxes on a copy so the caller's tensor is not modified in place
    new_boxes = boxes.clone()
    # Mirroring swaps the roles of the x-coordinates: the old x_max becomes the new x_min and vice versa
    new_boxes[:, 0] = image.width - boxes[:, 2] - 1
    new_boxes[:, 2] = image.width - boxes[:, 0] - 1

    return new_image, new_boxes
def resize(image, boxes, dims=(300, 300), return_percent_coords=True):
    """
    Resize a PIL image and rescale its bounding boxes. For the SSD300, resize to (300, 300).

    Boxes are first converted to fractional coordinates by dividing by the original image's
    width/height; these are returned as-is unless absolute coordinates are requested.

    :param image: image, a PIL Image
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param dims: target size passed to the resize call; dims[0] scales y-coordinates and dims[1] scales x-coordinates
    :param return_percent_coords: if True, return fractional box coordinates; otherwise scale them to 'dims'
    :return: resized image, updated bounding box coordinates (fractional or absolute)
    """
    resized_image = FT.resize(image, dims)

    # Fractional coordinates with respect to the original image
    width, height = image.width, image.height
    source_dims = torch.FloatTensor([width, height, width, height]).unsqueeze(0)
    fractional_boxes = boxes / source_dims

    if return_percent_coords:
        return resized_image, fractional_boxes

    # Absolute coordinates in the resized image
    target_dims = torch.FloatTensor([dims[1], dims[0], dims[1], dims[0]]).unsqueeze(0)
    return resized_image, fractional_boxes * target_dims
def transform(image, boxes, labels, difficulties, split):
    """
    Apply the augmentation and preprocessing pipeline to one image and its annotations.

    For 'TRAIN', random photometric distortion, expansion, cropping and flipping are applied first.
    For both splits the image is resized to (300, 300), converted to a tensor and normalized.

    :param image: image, a PIL Image
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param labels: labels of objects, a tensor of dimensions (n_objects)
    :param difficulties: difficulties of detection of these objects, a tensor of dimensions (n_objects)
    :param split: one of 'TRAIN' or 'TEST', since different sets of transformations are applied
    :return: transformed image, transformed bounding box coordinates, transformed labels, transformed difficulties
    """
    assert split in {'TRAIN', 'TEST'}

    # Mean and standard deviation of ImageNet data that our base VGG from torchvision was trained on
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    new_image = image
    new_boxes = boxes
    new_labels = labels
    new_difficulties = difficulties

    # Skip the following operations if validation/evaluation
    if split == 'TRAIN':
        # A series of photometric distortions in random order, each with 50% chance of occurrence, as in Caffe repo
        new_image = photometric_distort(new_image)

        # Convert PIL image to Torch tensor
        new_image = FT.to_tensor(new_image)

        # Expand image (zoom out) with a 50% chance - helpful for training detection of small objects
        # Fill surrounding space with the mean of ImageNet data that our base VGG was trained on
        if random.random() < 0.5:
            new_image, new_boxes = expand(new_image, new_boxes, filler=mean)

        # Randomly crop image (zoom in)
        new_image, new_boxes, new_labels, new_difficulties = random_crop(new_image, new_boxes, new_labels,
                                                                         new_difficulties)

        # Convert Torch tensor to PIL image
        new_image = FT.to_pil_image(new_image)

        # Flip image with a 50% chance
        if random.random() < 0.5:
            new_image, new_boxes = flip(new_image, new_boxes)

    # Resize image to (300, 300) - this also converts absolute boundary coordinates to their fractional form
    new_image, new_boxes = resize(new_image, new_boxes, dims=(300, 300))

    # Convert PIL image to Torch tensor
    new_image = FT.to_tensor(new_image)

    # Normalize by mean and standard deviation of ImageNet
    new_image = FT.normalize(new_image, mean=mean, std=std)

    return new_image, new_boxes, new_labels, new_difficulties
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment