# CALL WITH (from a separate driver script, not at module import time):
#
#   import argparse
#   from shared_code.custom_datasets import test_new_stuff
#
#   parser = argparse.ArgumentParser()
#   parser.add_argument('i', type=int, help='GPU index')
#   parser.add_argument('s', type=int, help='dataset size')
#   args = parser.parse_args()
#   test_new_stuff(args.i, args.s)
#-----------------------------
import pdb
import os
import os.path
import re
import sys
import json
import time
import copy
import itertools
import pickle
import numpy as np
import torch
from PIL import Image
from collections import defaultdict
from torchvision.datasets.vision import VisionDataset
# NOTE: torch._six was removed in newer PyTorch releases; there, collections.abc,
# (str, bytes), and int can stand in for these three imports
from torch._six import container_abcs, string_classes, int_classes
from .detectron_models import load_predictor_coco, load_predictor_vg, \
dummy_aux_data_dict, extract_raw_features, process_raw_features, \
update_box_features
def load_image(img_path, img_format='cv2'):
'''
Load an image from img_path and return either cv2 or PIL-style image.
'''
assert (img_format in ['cv2', 'PIL']), 'img_format must be cv2 or PIL.'
raw_img = Image.open(img_path).convert('RGB')
    if img_format == 'cv2':
        # cv2-style images use BGR channel order, so reverse the RGB channels
        raw_img = np.array(raw_img)
        raw_img = raw_img[:, :, ::-1].copy()
    return raw_img
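# A minimal usage sketch for load_image (the path is hypothetical):
#
#   pil_img = load_image('/data/imgs/000001.jpg', img_format='PIL')  # RGB PIL.Image
#   cv2_img = load_image('/data/imgs/000001.jpg', img_format='cv2')  # BGR np.ndarray
#   assert np.array_equal(np.array(pil_img)[:, :, ::-1], cv2_img)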
class CustomCocoCaptions(VisionDataset):
"""`MS Coco Captions <http://mscoco.org/dataset/#captions-challenge2015>`_ Dataset.
-- This custom version adds functionality for loading dicts of auxiliary data.
Args:
root : directory holding the source images
root_aux : directory holding the auxiliary data dicts (in .pkl form)
annFile : path to JSON annotations for linking images/captions/aux data
transform : transforms for augmenting/etc source images for the model
"""
def __init__(self, root, root_aux, annFile, transform=None):
super(CustomCocoCaptions, self).__init__(root, None, transform, None)
self.root_img = root
self.root_aux = root_aux # directory holding per-image auxiliary pkl files
self.coco = CustomCoCo(annFile)
self.ids = list(sorted(self.coco.imgs.keys()))
print('Loading CoCo dataset with {} images...'.format(len(self.coco.imgs)))
def __getitem__(self, index):
"""
Args:
index (int) : index of item to get (in pytorch frame of reference)
        Returns:
            tuple : (img, target, aux_dict) for this index -- the transformed
                image, its list of captions, and its auxiliary data dict
        """
coco = self.coco
# get coco index for the image with this pytorch/dataset index
img_id = self.ids[index]
# get ids of annotations associated with this image
ann_ids = coco.getAnnIds(imgIds=img_id)
anns = coco.loadAnns(ann_ids)
# process the list of annotations for this image
target = []
aux_file = None
for ann in anns:
if 'caption' in ann:
# found a caption!
target.append(ann['caption'])
elif 'aux_file' in ann:
# found an auxiliary data file!
aux_file = ann['aux_file']
# get info dict for the current image
img_info = coco.loadImgs(img_id)[0]
# load image file using path from info dict
img_path = os.path.join(self.root_img, img_info['file_name'])
raw_img = load_image(img_path, img_format='PIL')
# load auxiliary info file if available -- it should just be a dict!
if aux_file is not None:
            aux_path = os.path.join(self.root_aux, aux_file)
            with open(aux_path, 'rb') as f:
                aux_dict = pickle.load(f)
else:
aux_dict = {}
# apply augmentation/transforms to raw image to prep it for model
if self.transforms is not None:
img, target = self.transforms(raw_img, target)
# shove raw_img into aux_dict
# aux_dict['raw_img_pil'] = raw_img
return img, target, aux_dict
def __len__(self):
return len(self.ids)
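# A minimal sketch of wiring this dataset into a DataLoader (paths and the
# transform are hypothetical; coco_collate is defined below and handles the
# (img, target, aux_dict) tuples):
#
#   dataset = CustomCocoCaptions(root='/data/coco/train2017',
#                                root_aux='/data/coco/aux_train2017',
#                                annFile='/data/coco/captions_train2017.json',
#                                transform=my_transform)
#   loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True,
#                                        num_workers=4, collate_fn=coco_collate)
#   for img, target, aux_dict in loader:
#       ...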
np_str_obj_array_pattern = re.compile(r'[SaUO]')
coco_collate_err_msg_format = (
"coco_collate: batch must contain tensors, numpy arrays, numbers, "
"dicts or lists; found {}")
def coco_collate(batch):
r"""Puts each data field into a tensor with outer dimension batch size"""
elem = batch[0]
elem_type = type(elem)
if isinstance(elem, torch.Tensor):
out = None
if torch.utils.data.get_worker_info() is not None:
# If we're in a background process, concatenate directly into a
# shared memory tensor to avoid an extra copy
numel = sum([x.numel() for x in batch])
storage = elem.storage()._new_shared(numel)
out = elem.new(storage)
return torch.stack(batch, 0, out=out)
elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
and elem_type.__name__ != 'string_':
elem = batch[0]
if elem_type.__name__ == 'ndarray':
# array of string classes and object
if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
raise TypeError(coco_collate_err_msg_format.format(elem.dtype))
return coco_collate([torch.as_tensor(b) for b in batch])
elif elem.shape == (): # scalars
return torch.as_tensor(batch)
elif isinstance(elem, float):
return torch.tensor(batch, dtype=torch.float64)
elif isinstance(elem, int_classes):
return torch.tensor(batch)
elif isinstance(elem, string_classes):
return batch
    # NOTE: dict/mapping elements (i.e. the aux data dicts) ARE collated
    # recursively here; swap in the commented `return batch` below to pass
    # them through uncollated instead
    elif isinstance(elem, container_abcs.Mapping):
        return {key: coco_collate([d[key] for d in batch]) for key in elem}
        # return batch
elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
return elem_type(*(coco_collate(samples) for samples in zip(*batch)))
elif isinstance(elem, container_abcs.Sequence):
transposed = zip(*batch)
return [coco_collate(samples) for samples in transposed]
raise TypeError(coco_collate_err_msg_format.format(elem_type))
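# A small sketch of what coco_collate does with mixed structures
# (toy values, just for illustration):
#
#   batch = [(torch.ones(3), ['a cat'], {'boxes': np.zeros((2, 4))}),
#            (torch.ones(3), ['a dog'], {'boxes': np.ones((2, 4))})]
#   imgs, caps, aux = coco_collate(batch)
#   # imgs : tensor of shape (2, 3)
#   # caps : [('a cat', 'a dog')] -- captions transposed across the batch
#   # aux  : {'boxes': tensor of shape (2, 2, 4)}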
#
# HELPER STUFF FOR WORKING WITH COCO-STYLE ANNOTATED IMAGE DATASETS
#
def _isArrayLike(obj):
return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
class CustomCoCo:
def __init__(self, annotation_file):
"""
        Constructor of the CustomCoCo helper class for reading Microsoft COCO-style annotations.
:param annotation_file (str): location of annotation file
:return:
"""
# ...
self.dataset, self.anns, self.cats, self.imgs = \
dict(), dict(), dict(), dict()
self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
# load dataset
        print('Loading annotations into memory...')
        tic = time.time()
        with open(annotation_file, 'r') as f:
            dataset = json.load(f)
        assert (type(dataset) == dict), \
            'annotation file format {} not supported'.format(type(dataset))
# ...
self.dataset = dataset
self.buildIndex()
print('Done (t={:0.2f}s)'.format(time.time() - tic))
def buildIndex(self):
# build index for retrieving things from the current self.dataset
print('creating index...')
anns, cats, imgs = {}, {}, {}
imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
if 'annotations' in self.dataset:
for ann in self.dataset['annotations']:
imgToAnns[ann['image_id']].append(ann)
anns[ann['id']] = ann
if 'images' in self.dataset:
for img in self.dataset['images']:
imgs[img['id']] = img
if 'categories' in self.dataset:
for cat in self.dataset['categories']:
cats[cat['id']] = cat
if 'annotations' in self.dataset and 'categories' in self.dataset:
for ann in self.dataset['annotations']:
catToImgs[ann['category_id']].append(ann['image_id'])
print('index created!')
# create class members
self.anns = anns # map from ann ids to anns
self.imgToAnns = imgToAnns # map from img ids to their anns
self.catToImgs = catToImgs # map from cat ids to their imgs
self.imgs = imgs # map from img ids to imgs
self.cats = cats # map from cat ids to cats
def saveDataset(self, output_file):
'''
Save the current set of coco-style annotations to some file (JSON).
'''
        self.buildIndex()  # rebuild index to make sure it's up-to-date
        print('Writing dataset JSON to: {0:s}'.format(output_file))
        with open(output_file, 'w') as json_file:
            json.dump(self.dataset, json_file)
def addAnn(self, ann_dict):
'''
Add a dict-style annotation to the dataset.
-- check if this annotation is already in dataset before adding it
'''
img_id = ann_dict['image_id'] # image this annotation belongs to
# check that the annotation is not already in the dataset
ann_exists = False
for ann in self.imgToAnns[img_id]:
ann_match = True
for k in ann_dict:
if not (k in ann):
# there's a key that doesn't match
ann_match = False
break
elif not (ann_dict[k] == ann[k]):
# there's a value that doesn't match
ann_match = False
break
if ann_match:
ann_exists = True
break
# if the annotation doesn't exist yet, then add it to dataset
if not ann_exists:
self.dataset['annotations'].append(ann_dict)
self.imgToAnns[img_id].append(ann_dict)
self.anns[ann_dict['id']] = ann_dict
def info(self):
"""
Print information about the annotation file.
:return:
"""
for key, value in self.dataset['info'].items():
print('{}: {}'.format(key, value))
def getAllIds(self):
'''
Get all integer-valued ids for all images/annotations/categories.
'''
ids = []
ids.extend(self.imgs)
ids.extend(self.anns)
ids.extend(self.cats)
return set(ids)
def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
"""
        Get ann ids that satisfy the given filter conditions (an empty filter is skipped).
:param imgIds (int array) : get anns for given imgs
catIds (int array) : get anns for given cats
areaRng (float array) : get anns for given area range (e.g. [0 inf])
iscrowd (boolean) : get anns for given crowd label (False or True)
:return: ids (int array) : integer array of ann ids
"""
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
catIds = catIds if _isArrayLike(catIds) else [catIds]
if len(imgIds) == len(catIds) == len(areaRng) == 0:
anns = self.dataset['annotations']
else:
if not len(imgIds) == 0:
lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns]
anns = list(itertools.chain.from_iterable(lists))
else:
anns = self.dataset['annotations']
anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds]
anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]]
if iscrowd is not None:
ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
else:
ids = [ann['id'] for ann in anns]
return ids
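    # A quick sketch of the annotation lookup flow used by CustomCocoCaptions
    # above (img_id is hypothetical):
    #
    #   ann_ids = coco.getAnnIds(imgIds=img_id)
    #   anns = coco.loadAnns(ann_ids)
    #   captions = [ann['caption'] for ann in anns if 'caption' in ann]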
def getCatIds(self, catNms=[], supNms=[], catIds=[]):
"""
filtering parameters. default skips that filter.
:param catNms (str array) : get cats for given cat names
:param supNms (str array) : get cats for given supercategory names
:param catIds (int array) : get cats for given cat ids
:return: ids (int array) : integer array of cat ids
"""
catNms = catNms if _isArrayLike(catNms) else [catNms]
supNms = supNms if _isArrayLike(supNms) else [supNms]
catIds = catIds if _isArrayLike(catIds) else [catIds]
if len(catNms) == len(supNms) == len(catIds) == 0:
cats = self.dataset['categories']
else:
cats = self.dataset['categories']
cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms]
cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms]
cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds]
ids = [cat['id'] for cat in cats]
return ids
def getImgIds(self, imgIds=[], catIds=[]):
'''
Get img ids that satisfy given filter conditions.
:param imgIds (int array) : get imgs for given ids
:param catIds (int array) : get imgs with all given cats
:return: ids (int array) : integer array of img ids
'''
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
catIds = catIds if _isArrayLike(catIds) else [catIds]
if len(imgIds) == len(catIds) == 0:
ids = self.imgs.keys()
else:
ids = set(imgIds)
for i, catId in enumerate(catIds):
if i == 0 and len(ids) == 0:
ids = set(self.catToImgs[catId])
else:
ids &= set(self.catToImgs[catId])
return list(ids)
def loadAnns(self, ids=[]):
"""
Load anns with the specified ids.
:param ids (int array) : integer ids specifying anns
:return: anns (object array) : loaded ann objects
"""
if _isArrayLike(ids):
return [self.anns[id] for id in ids]
elif type(ids) == int:
return [self.anns[ids]]
def loadCats(self, ids=[]):
"""
Load cats with the specified ids.
:param ids (int array) : integer ids specifying cats
:return: cats (object array) : loaded cat objects
"""
if _isArrayLike(ids):
return [self.cats[id] for id in ids]
elif type(ids) == int:
return [self.cats[ids]]
def loadImgs(self, ids=[]):
"""
        Load imgs with the specified ids.
        :param ids (int array) : integer ids specifying imgs
:return: imgs (object array) : loaded img objects
"""
if _isArrayLike(ids):
return [self.imgs[id] for id in ids]
elif type(ids) == int:
return [self.imgs[ids]]
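    # Continuing the sketch: map an img id to its file on disk, as in
    # CustomCocoCaptions.__getitem__ (img_dir is hypothetical):
    #
    #   img_info = coco.loadImgs(img_id)[0]
    #   img_path = os.path.join(img_dir, img_info['file_name'])
    #   anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))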
class CoCoAuxData:
def __init__(self, img_dir):
'''
Class for scanning over a CoCo-style dataset and adding some auxiliary
information for each of the underlying images.
Input:
img_dir : directory where underlying images are stored
'''
self.img_dir = img_dir
def new_aux_data(self, start_idx, num_samples, predictor, ann_in_file, ann_out_file, aux_out_dir,
num_boxes=50, img_extension='.jpg', real_aux_data=False):
'''
Compute totally new auxiliary data for the underlying CoCo dataset.
        Input:
            start_idx : index of the first image to process (for sharding work across processes)
            num_samples : number of images to process, starting from start_idx
            predictor : detectron2-style model to use for boxes and features
            ann_in_file : input file for CoCo JSON with source annotations
            ann_out_file : output file for CoCo JSON with new annotations
            aux_out_dir : directory to write the new auxiliary data to
            num_boxes : number of boxes to detect for each image
            img_extension : file extension for the source images
            real_aux_data : whether to compute real aux data or dummy data
Output:
-- writes some files...
'''
# load the source coco annotations/dataset
self.coco = CustomCoCo(ann_in_file)
# make a directory for storing auxiliary files if it doesn't exist
if not os.path.isdir(aux_out_dir):
os.mkdir(aux_out_dir)
# get max id of any item in dataset, to let us generate new ids
# TODO: improve method for assigning unique IDs
# max_old_id = max(self.coco.getAllIds())
# min_new_id = max_old_id + 1
# make an auxiliary data file and json annotation for each image
tic = time.time()
print('Adding auxiliary data to {0:d} images:'.format(len(self.coco.imgs)))
raw_images, aux_anns, aux_data_dicts = [], [], []
for i in range(start_idx, start_idx+num_samples):
            # for i, img_id in enumerate(self.coco.imgs):
            # skip ids not in the dataset, and skip images whose aux file
            # already exists (NOTE: the resume check uses a hardcoded path)
            if i not in self.coco.imgs or os.path.isfile('/gcc/GCC/train/AUX_train/{:07.0f}.pkl'.format(i)):
                continue
img = self.coco.imgs[i]
#img = self.coco.imgs[img_id]
img_name = img['file_name']
if img_name.endswith(img_extension):
aux_name = img_name.replace(img_extension, '.pkl')
else:
assert False, 'Invalid image file extension!'
# load the image from disk...
img_path = os.path.join(self.img_dir, img_name)
try:
img_data = load_image(img_path, img_format='cv2')
except Exception as e:
print(e)
continue
raw_images.append(img_data)
# make an annotation json dict for this image's aux data
aux_ann = {'id': img['id'],
'image_id': img['id'],
'aux_file': aux_name}
aux_anns.append(aux_ann)
            if len(raw_images) == 5:
                # compute and dump auxiliary data in batches of 5 images
aux_data_dicts = \
self._compute_aux_data(raw_images, predictor, num_boxes, real_aux_data)
self._write_aux_data(aux_out_dir, aux_anns, aux_data_dicts)
raw_images, aux_anns, aux_data_dicts = [], [], []
if ((i + 1) % 100) == 0:
                # progress indicator
img_per_sec = 100. / (time.time() - tic)
print('-- {0:7d} images completed, {1:.2f} img/sec...'.format((i + 1), img_per_sec))
tic = time.time()
        if (len(raw_images) > 0) and (len(raw_images) < 5):
            # compute and dump auxiliary data for the final batch of <5 images
aux_data_dicts = \
self._compute_aux_data(raw_images, predictor, num_boxes, real_aux_data)
self._write_aux_data(aux_out_dir, aux_anns, aux_data_dicts)
        # store dataset json file including updates for the new annotations
        # (NOTE: saveDataset is currently disabled here, so ann_out_file is
        # not actually rewritten by this method)
        # self.coco.saveDataset(ann_out_file)
        print('Finished shard; skipped rewriting dataset JSON: {0:s}'.format(ann_out_file))
return
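    # A minimal sketch of calling new_aux_data directly for one shard, using
    # the same hardcoded /gcc/GCC paths as test_new_stuff below (0 and 1000
    # are a hypothetical start index and shard size):
    #
    #   cad = CoCoAuxData('/gcc/GCC/train/train')
    #   cad.new_aux_data(0, 1000, predictor,
    #                    '/gcc/GCC/train/captions_train.json',
    #                    '/gcc/GCC/train/AUX_train.json',
    #                    '/gcc/GCC/train/AUX_train',
    #                    num_boxes=50, img_extension='.jpg', real_aux_data=True)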
def new_aux_ftrs(self, encoder, ann_in_file, ann_out_file,
aux_in_dir, aux_out_dir, real_aux_data=False):
'''
Given a CoCo dataset with existing bbox annotations, compute new
auxiliary features for each bbox using resized crops from the raw images
as input to an encoder (rather than box features from the MaskRCNN).
-- we want to decouple box quality/cost from feature quality/cost
Input:
encoder : encoder to turn cropped+resized regions into features
ann_in_file : input file for CoCo JSON with new annotations
ann_out_file : output file for CoCo JSON with new annotations
aux_in_dir : directory to load existing auxiliary data from
aux_out_dir : directory to write new auxiliary data to
real_aux_data : whether to compute real aux data or dummy data
Output:
-- writes some files...
'''
# load the source coco annotations/dataset
self.coco = CustomCoCo(ann_in_file)
# make a directory for storing auxiliary files if it doesn't exist
if not os.path.isdir(aux_out_dir):
os.mkdir(aux_out_dir)
# make an auxiliary data file and json annotation for each image
tic = time.time()
print('Computing new auxiliary features for {0:d} images:'.format(len(self.coco.imgs)))
raw_images, aux_anns, aux_data_dicts = [], [], []
for i, img_id in enumerate(self.coco.imgs):
# fetch image info from the coco manager
img = self.coco.imgs[img_id]
img_file = img['file_name']
# fetch the image from disk...
img_path = os.path.join(self.img_dir, img_file)
raw_images.append(load_image(img_path, img_format='cv2'))
# fetch the auxiliary data annotation for this image
img_anns = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
aux_ann = [ann for ann in img_anns if ('aux_file' in ann)][0]
aux_anns.append(aux_ann)
# fetch the auxiliary data for this image
            aux_file = os.path.join(aux_in_dir, aux_ann['aux_file'])
            with open(aux_file, 'rb') as f:
                aux_data_dict = pickle.load(f)
aux_data_dicts.append(aux_data_dict)
# ...
            if len(raw_images) == 5:
                # compute new auxiliary features in batches of 5 images
aux_data_dicts = \
self._compute_aux_features(encoder, raw_images, aux_data_dicts, real_aux_data)
self._write_aux_data(aux_out_dir, aux_anns, aux_data_dicts)
raw_images, aux_anns, aux_data_dicts = [], [], []
if ((i + 1) % 100) == 0:
                # progress indicator
img_per_sec = 100. / (time.time() - tic)
print('-- {0:7d} images completed, {1:.2f} img/sec...'.format((i + 1), img_per_sec))
tic = time.time()
        if (len(raw_images) > 0) and (len(raw_images) < 5):
            # compute and dump auxiliary data for the final batch of <5 images
aux_data_dicts = \
self._compute_aux_features(encoder, raw_images, aux_data_dicts, real_aux_data)
self._write_aux_data(aux_out_dir, aux_anns, aux_data_dicts)
        # store dataset json file including updates for the new annotations
        self.coco.saveDataset(ann_out_file)
        print('Updated dataset written to: {0:s}'.format(ann_out_file))
return
def _compute_aux_data(self, raw_images, predictor, num_boxes, real_aux_data):
'''
Compute an auxiliary data dict for each image in raw_images.
        Input:
            raw_images : list of images as numpy arrays in 'cv2' format
            predictor : detectron2-style model to use for boxes and features
            num_boxes : number of boxes to detect for each image
            real_aux_data : flag for whether to compute real or dummy aux data
        Output:
            aux_data_dicts : a list holding one aux data dict per image in raw_images
'''
if real_aux_data:
# compute auxiliary data using a detectron2-style model
instances, features, probs = \
extract_raw_features(raw_images, predictor, num_boxes)
aux_data_dicts = process_raw_features(raw_images, instances, features, probs)
else:
aux_data_dicts = []
            for raw_img in raw_images:
                # make a dummy auxiliary data dict for each image
                aux_data_dict = dummy_aux_data_dict(num_boxes)
                aux_data_dicts.append(aux_data_dict)
return aux_data_dicts
def _compute_aux_features(self, encoder, raw_images, aux_data_dicts, real_aux_data):
'''
Compute new per-box features for these images, using this encoder.
'''
if real_aux_data:
aux_data_dicts = update_box_features(encoder, raw_images, aux_data_dicts)
else:
pass
return aux_data_dicts
def _write_aux_data(self, aux_dir, aux_anns, aux_data_dicts):
'''
Write batch of auxiliary data to disk and update coco annotation file.
        Input:
            aux_dir : directory to write the auxiliary data files into
            aux_anns : list of coco annotations linking aux data to images
            aux_data_dicts : list of dicts holding aux data for each image
'''
for aux_ann, aux_data_dict in zip(aux_anns, aux_data_dicts):
# add annotation to the coco annotation file
self.coco.addAnn(aux_ann)
            # save this image's auxiliary data dict to its own pickle file
            aux_file_path = os.path.join(aux_dir, aux_ann['aux_file'])
            with open(aux_file_path, 'wb') as f:
                pickle.dump(aux_data_dict, f)
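# Reading an aux file back mirrors what CustomCocoCaptions.__getitem__ does
# above (the filename below is hypothetical; names follow '{:07.0f}.pkl'):
#
#   with open(os.path.join(aux_dir, '0000042.pkl'), 'rb') as f:
#       aux_data_dict = pickle.load(f)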
def test_new_stuff(i, ds_size, stuff_type='aux_data'):
    # hardcoded paths and settings for now
    print('Loading detectron2 predictor...')
    # only the VG-trained predictor is used below; the coco load is kept for reference
    # predictor, _ = load_predictor_coco()
    predictor, _ = load_predictor_vg()
# set directories for source dataset
source_ann_json = '/gcc/GCC/train/captions_train.json'
source_img_dir = '/gcc/GCC/train/train'
# set directories for new annotation files
new_ann_json = '/gcc/GCC/train/AUX_train.json'
new_aux_data_dir = '/gcc/GCC/train/AUX_train'
# run the stuff...
print('Building CoCoAuxData...')
cad = CoCoAuxData(source_img_dir)
    if stuff_type == 'aux_data':
        print('Adding auxiliary data to dataset...')
        # split work into 4 shards (e.g. one per GPU); i selects this process's shard
        chunk_size = (ds_size // 4) + 1
        start_idx = chunk_size * i
        cad.new_aux_data(start_idx, chunk_size, predictor, source_ann_json, new_ann_json, new_aux_data_dir,
                         num_boxes=50, img_extension='.jpg', real_aux_data=True)
else:
print('Adding new auxiliary features to dataset...')
encoder = predictor
source_ann_json = new_ann_json
source_aux_data_dir = new_aux_data_dir
cad.new_aux_ftrs(encoder, source_ann_json, new_ann_json,
source_aux_data_dir, new_aux_data_dir,
real_aux_data=True)
##############
# EYE BUFFER #
##############