Skip to content

Instantly share code, notes, and snippets.

Last active April 26, 2020 08:54
Show Gist options
  • Save DataTurks/0cc13f59a423a9d4f039225bf4fc8e03 to your computer and use it in GitHub Desktop.
Save DataTurks/0cc13f59a423a9d4f039225bf4fc8e03 to your computer and use it in GitHub Desktop.
Convert Dataturks Image bounding box JSON to Pascal VOC format.
import argparse
import sys
import os
import json
import logging
import requests
from PIL import Image
################### INSTALLATION NOTE #######################
## pip install requests
## pip install pillow
#enable info logging.
def maybe_download(image_url, image_dir):
    """Download the image if it does not already exist; return its local path.

    The local file name is the last "/"-separated component of ``image_url``.
    If a file with that name is already present in ``image_dir`` it is reused
    and no request is made.

    Args:
        image_url: URL of the image to fetch.
        image_dir: Directory where the image is (or will be) stored.

    Returns:
        Path of the image file inside ``image_dir``.

    Raises:
        ValueError: If the server answers with a status other than 200.
        Exception: Any error raised while requesting or writing the file is
            logged and re-raised unchanged.
    """
    fileName = image_url.split("/")[-1]
    filePath = os.path.join(image_dir, fileName)
    if os.path.exists(filePath):
        return filePath

    # else download the image
    try:
        # A timeout keeps one unresponsive host from hanging the whole run.
        response = requests.get(image_url, timeout=60)
        if response.status_code != 200:
            raise ValueError("Not a 200 response")
        with open(filePath, 'wb') as f:
            f.write(response.content)
        return filePath
    except Exception as e:
        logging.exception("Failed to download image at " + image_url + " \n" + str(e) + "\nignoring....")
        # Bare raise preserves the original traceback for the caller.
        raise
def get_xml_for_bbx(bbx_label, bbx_data, width, height):
    """Build the Pascal VOC ``<object>`` XML fragment for one labeled box.

    Args:
        bbx_label: Label written into the ``<name>`` element (XML-escaped).
        bbx_data: One annotation entry. ``bbx_data['points']`` is either four
            ``[x, y]`` corner pairs (regular bounding box) or, for the OCR
            format, ``{'x': ..., 'y': ...}`` dicts where point 0 is the top
            left and point 1 the bottom right corner. Coordinates are scaled
            by ``width``/``height`` to obtain pixel values.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        The ``<object>...</object>`` fragment as a string, with integer pixel
        coordinates in ``<bndbox>``.
    """
    from xml.sax.saxutils import escape

    points = bbx_data['points']
    if len(points) == 4:
        # Regular BBX has 4 points of the rectangle.
        xs = [point[0] for point in points]
        ys = [point[1] for point in points]
        # Truncate to whole pixels so both formats emit integer coordinates.
        xmin = int(width * min(xs))
        ymin = int(height * min(ys))
        xmax = int(width * max(xs))
        ymax = int(height * max(ys))
    else:
        # OCR BBX format has 'x','y' in one point.
        # We store the left top and right bottom as point '0' and point '1'
        xmin = int(points[0]['x'] * width)
        ymin = int(points[0]['y'] * height)
        xmax = int(points[1]['x'] * width)
        ymax = int(points[1]['y'] * height)

    parts = [
        "<object>\n",
        # Escape so labels containing &, < or > still give well-formed XML.
        "\t<name>" + escape(str(bbx_label)) + "</name>\n",
        "\t<pose>Unspecified</pose>\n",
        "\t<truncated>Unspecified</truncated>\n",
        "\t<difficult>Unspecified</difficult>\n",
        "\t<occluded>Unspecified</occluded>\n",
        "\t<bndbox>\n",
        "\t\t<xmin>" + str(xmin) + "</xmin>\n",
        "\t\t<xmax>" + str(xmax) + "</xmax>\n",
        "\t\t<ymin>" + str(ymin) + "</ymin>\n",
        "\t\t<ymax>" + str(ymax) + "</ymax>\n",
        "\t</bndbox>\n",
        "</object>\n",
    ]
    return "".join(parts)
def convert_to_PascalVOC(dataturks_labeled_item, image_dir, xml_out_dir):
    """Convert one Dataturks labeled item to a Pascal VOC XML file.

    The image referenced by the item is downloaded into ``image_dir`` unless
    it is already there, its real size is read from the file, and an XML file
    named ``<image file name>.xml`` is written to ``xml_out_dir``.

    Args:
        dataturks_labeled_item: JSON of one labeled image from dataturks.
        image_dir: Path to directory to downloaded images (or a directory
            already having the images downloaded).
        xml_out_dir: Path to the dir where the xml needs to be written.

    Returns:
        True if the XML file was written; False if the item has no
        annotations (skipped item) or if any error occurred (the error is
        logged, not raised).
    """
    from xml.sax.saxutils import escape

    try:
        data = json.loads(dataturks_labeled_item)
        annotations = data.get('annotation')
        # Covers both an empty list and a missing/null annotation.
        if not annotations:
            logging.info("Ignoring Skipped Item")
            return False

        image_url = data['content']
        filePath = maybe_download(image_url, image_dir)

        # The size stored on the downloaded image is authoritative, so the
        # imageWidth/imageHeight fields of the annotation are not consulted.
        with Image.open(filePath) as img:
            width, height = img.size

        # os.path handles both "/" and the platform separator, unlike
        # splitting on "/", and copes with a trailing separator on image_dir.
        fileName = os.path.basename(filePath)
        image_dir_folder_Name = os.path.basename(os.path.normpath(image_dir))

        parts = [
            "<annotation>\n<folder>" + escape(image_dir_folder_Name) + "</folder>\n",
            "<filename>" + escape(fileName) + "</filename>\n",
            "<path>" + escape(filePath) + "</path>\n",
            "<source>\n\t<database>Unknown</database>\n</source>\n",
            "<size>\n",
            "\t<width>" + str(width) + "</width>\n",
            "\t<height>" + str(height) + "</height>\n",
            "\t<depth>Unspecified</depth>\n",
            "</size>\n",
            "<segmented>Unspecified</segmented>\n",
        ]

        for bbx in annotations:
            if not bbx:
                continue
            # Pascal VOC only supports rectangles.
            if "shape" in bbx and bbx["shape"] != "rectangle":
                continue

            bbx_labels = bbx['label']
            # handle both list of labels or a single label.
            if not isinstance(bbx_labels, list):
                bbx_labels = [bbx_labels]

            for bbx_label in bbx_labels:
                parts.append(get_xml_for_bbx(bbx_label, bbx, width, height))

        parts.append("</annotation>")

        # output to a file.
        xmlFilePath = os.path.join(xml_out_dir, fileName + ".xml")
        with open(xmlFilePath, 'w', encoding='utf-8') as f:
            f.write("".join(parts))
        return True
    except Exception as e:
        # Best effort per item: log and report failure so the caller can
        # continue with the remaining items.
        logging.exception("Unable to process item %s\nerror = %s", dataturks_labeled_item, e)
        return False
def main():
    """Convert every item of the Dataturks JSON file to Pascal VOC XML.

    Reads the module-level names ``dataturks_JSON_FilePath``,
    ``image_download_dir`` and ``pascal_voc_xml_dir``. Each non-blank line of
    the JSON file is passed to ``convert_to_PascalVOC``. Progress is logged
    every 10 items and a summary is logged at the end. If any path is invalid
    or the file is empty, an error is logged and the function returns without
    converting anything.
    """
    # make sure everything is setup.
    # logging.error rather than logging.exception: no exception is active here.
    if not os.path.isdir(image_download_dir):
        logging.error("Please specify a valid directory path to download images, " + image_download_dir + " doesn't exist")
        return
    if not os.path.isdir(pascal_voc_xml_dir):
        logging.error("Please specify a valid directory path to write Pascal VOC xml files, " + pascal_voc_xml_dir + " doesn't exist")
        return
    if not os.path.exists(dataturks_JSON_FilePath):
        logging.error(
            "Please specify a valid path to dataturks JSON output file, " + dataturks_JSON_FilePath + " doesn't exist")
        return

    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) are not items; drop them so
        # they are not counted as failures.
        lines = [line for line in f if line.strip()]

    if not lines:
        logging.error(
            "Please specify a valid path to dataturks JSON output file, " + dataturks_JSON_FilePath + " is empty")
        return

    success = 0
    for count, line in enumerate(lines, start=1):
        if convert_to_PascalVOC(line, image_download_dir, pascal_voc_xml_dir):
            success = success + 1

        if count % 10 == 0:
            logging.info(str(count) + " items done ...")

    logging.info("Completed: " + str(success) + " items done, " + str(len(lines) - success) + " items ignored due to errors or for being skipped items.")
def create_arg_parser():
    """Create and return the ArgumentParser for this script.

    The parser takes three required positional arguments, in this order:
    ``dataturks_JSON_FilePath``, ``image_download_dir`` and
    ``pascal_voc_xml_dir``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(description='Converts Dataturks output JSON file for Image bounding box to Pascal VOC format.')
    parser.add_argument('dataturks_JSON_FilePath',
                        help='Path to the JSON file downloaded from Dataturks.')
    parser.add_argument('image_download_dir',
                        help='Path to the directory where images will be dowloaded (if not already found in the directory).')
    parser.add_argument('pascal_voc_xml_dir',
                        help='Path to the directory where Pascal VOC XML files will be stored.')
    return parser
if __name__ == '__main__':
    # Enable info logging so the progress and summary messages are shown.
    logging.basicConfig(level=logging.INFO)

    arg_parser = create_arg_parser()
    parsed_args = arg_parser.parse_args(sys.argv[1:])

    # Setup global paths needed across the script. These assignments are at
    # module level, so they are already module globals read by main(); no
    # `global` statement is needed here.
    dataturks_JSON_FilePath = parsed_args.dataturks_JSON_FilePath
    image_download_dir = parsed_args.image_download_dir
    pascal_voc_xml_dir = parsed_args.pascal_voc_xml_dir

    # Run the conversion; without this call the script parses its arguments
    # and exits without doing any work.
    main()
Copy link

ht9999 commented Sep 16, 2019

i am getting error in line 141 in function convert_to_pascal. Error is in line with open(xmlfilepath,'w') in command prompt.Please help

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment