Last active
October 18, 2017 06:48
-
-
Save InnerPeace-Wu/49567f6008d4b9a841d816c6cb2f5a1c to your computer and use it in GitHub Desktop.
Handles the case where there is not enough memory to load the entire region-description JSON file of the Visual Genome dataset at once.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ---------------------------------------------- | |
# DenseCap | |
# Written by InnerPeace | |
# ---------------------------------------------- | |
"""read large region description json file""" | |
import ijson | |
import json | |
import sys | |
import os | |
# Dataset version; used as a sub-directory name under VG_PATH.
VG_VERSION = '1.2'
# Root directory of the Visual Genome data (placeholder; edit before running).
VG_PATH = '/path/to/visual/genome/data'
# Input: the single large region-description JSON file to be split.
VG_REGION_PATH = '%s/%s/region_descriptions.json' % (VG_PATH, VG_VERSION)
# Output: directory that receives one JSON file per image.
REGION_JSON = '%s/%s/regions_disc' % (VG_PATH, VG_VERSION)
def read_regions():
    """Split the region-description JSON into one JSON file per image.

    The file at ``VG_REGION_PATH`` is consumed as a stream of
    ``(prefix, event, value)`` tuples from ``ijson.parse`` instead of being
    loaded in one piece.  Each time an image-level ``"id"`` value is seen,
    a file ``REGION_JSON/<id>.json`` is written containing
    ``{"regions": [...], "id": <id>}``.  The output directory is created if
    it does not exist, and a running count of written files is shown on
    stdout.

    NOTE(review): the state machine assumes every image object lists its
    "regions" array *before* its "id" key and that region objects have no
    key literally named "id" or "regions" -- confirm against the dataset
    version in use.
    NOTE(review): ``json.dump`` cannot serialise ``Decimal``; this presumes
    the region fields are integers or strings -- TODO confirm.
    """
    if not os.path.exists(REGION_JSON):
        os.makedirs(REGION_JSON)

    last_key = None  # most recent map key; names the next scalar value
    image = {}       # record written out for the current image
    regions = []     # region dicts collected for the current image
    region = {}      # fields of the region object being parsed
    count = 0        # number of per-image files written so far

    # Progress is refreshed once per written file rather than once per
    # parser event; the line shown on the terminal is the same.
    sys.stdout.write('>>> %d \r' % count)
    sys.stdout.flush()

    # Binary mode lets ijson do the UTF-8 decoding itself, and the context
    # manager guarantees the (very large) input file is closed again.
    with open(VG_REGION_PATH, 'rb') as source:
        for _prefix, event, value in ijson.parse(source):
            if event == 'map_key' and value == 'regions':
                # Start of a new image's region list.  Checking the event
                # keeps a phrase whose text is exactly "regions" from
                # wiping the regions gathered so far.
                image = {}
                regions = []
                last_key = None
            elif last_key == 'id' and value:
                # The image id closes the record: flush it to its own file.
                count += 1
                image['regions'] = regions
                image['id'] = value
                with open(REGION_JSON + '/%s.json' % value, 'w') as f:
                    json.dump(image, f)
                sys.stdout.write('>>> %d \r' % count)
                sys.stdout.flush()
            elif event == 'map_key':
                last_key = value
            elif event == 'end_map':
                # A region object is complete.  The image-level end_map
                # also lands here and appends an empty dict, but `regions`
                # is replaced at the next "regions" key before being used.
                regions.append(region)
                region = {}
                last_key = None
            elif last_key:
                region[last_key] = value
# Run the split only when executed as a script, not when imported.
if __name__ == '__main__':
    read_regions()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment