Skip to content

Instantly share code, notes, and snippets.

@adammcmaster
Created January 14, 2020 16:13
Show Gist options
  • Save adammcmaster/de82785145e091134e297d5baea4ff29 to your computer and use it in GitHub Desktop.
Save adammcmaster/de82785145e091134e297d5baea4ff29 to your computer and use it in GitHub Desktop.
A script to produce an AutoML training file from Zooniverse classifications for P4 Ridges
import csv
import json
import os
import re
# Subject sets whose subjects are eligible for the training file.
subject_sets = {
    '7585',
    '7870',
    '7888',
    '8385',
    '9845',
    '13286',
}
# Only classifications made through this workflow are counted.
workflow_id = '2627'
# Replaces the leading "https://" of each subject's image URL.
GCS_PATH_PREFIX = 'gs://golden-toolbox-239212-vcm/'
# Minimum fraction of "yes" votes for a subject to be labelled 'ridge'.
RIDGE_THRESHOLD = 0.75
# Directory holding the input exports and receiving the output CSV.
# expanduser() honours $HOME when it is set and still works when it is not.
FILE_PATH = os.path.join(os.path.expanduser('~'), 'Downloads')


def load_subjects(rows, wanted_sets, gcs_prefix):
    """Build the per-subject vote table from subject export rows.

    Args:
        rows: iterable of mappings with 'subject_id', 'subject_set_id' and
            'locations' (a JSON object whose values are URLs) keys, e.g. a
            ``csv.DictReader`` over the subjects export.
        wanted_sets: container of subject set ids to keep.
        gcs_prefix: string substituted for a leading ``https://`` in the
            subject's first location URL.

    Returns:
        dict mapping subject id to
        ``{'location': str, 'yes_votes': 0.0, 'no_votes': 0.0}``.
    """
    loaded = {}
    for row in rows:
        if row['subject_set_id'] not in wanted_sets:
            continue
        # dict views are not indexable on Python 3, so materialise first.
        urls = list(json.loads(row['locations']).values())
        if not urls:
            # Nothing to point the training file at for this subject.
            continue
        loaded[row['subject_id']] = {
            'location': re.sub(r'^https://', gcs_prefix, urls[0]),
            'yes_votes': 0.0,
            'no_votes': 0.0,
        }
    return loaded


def count_votes(rows, subjects, wanted_workflow):
    """Tally yes/no answers to task 'T0' into ``subjects`` (in place).

    Args:
        rows: iterable of mappings with 'workflow_id', 'subject_ids' and
            'annotations' (a JSON list of ``{'task', 'value'}`` objects)
            keys, e.g. a ``csv.DictReader`` over the classifications export.
        subjects: vote table as returned by ``load_subjects``.
        wanted_workflow: rows from any other workflow are ignored.

    Returns:
        The same ``subjects`` mapping, for convenience.
    """
    for row in rows:
        if row['workflow_id'] != wanted_workflow:
            continue
        subject = subjects.get(row['subject_ids'])
        if subject is None:
            continue
        for annotation in json.loads(row['annotations']):
            if annotation['task'] != 'T0' or not annotation['value']:
                continue
            answer = annotation['value'].lower()
            if answer == 'yes':
                subject['yes_votes'] += 1.0
            elif answer == 'no':
                subject['no_votes'] += 1.0
            # Only the first answered T0 annotation of a classification
            # is considered, whatever its answer was.
            break
    return subjects


def classify_subjects(subjects, threshold):
    """Yield ``(location, label)`` for every subject that received votes.

    The label is 'ridge' when the fraction of yes votes is at least
    ``threshold`` and 'no_ridge' otherwise. Subjects with no yes/no votes
    are omitted.
    """
    for subject in subjects.values():
        vote_total = subject['yes_votes'] + subject['no_votes']
        if vote_total == 0:
            continue
        yes_proportion = subject['yes_votes'] / vote_total
        label = 'ridge' if yes_proportion >= threshold else 'no_ridge'
        yield subject['location'], label


def main():
    """Read both exports from FILE_PATH and write the AutoML CSV there."""
    print('Loading subjects')
    subjects_path = os.path.join(FILE_PATH, 'planet-four-ridges-subjects.csv')
    # newline='' is what the csv module expects of files it reads/writes.
    with open(subjects_path, newline='', encoding='utf-8') as subj_file:
        subjects = load_subjects(
            csv.DictReader(subj_file), subject_sets, GCS_PATH_PREFIX
        )

    print('Counting votes from classifications')
    classifications_path = os.path.join(
        FILE_PATH, 'polygonal-ridge-workflow-classifications.csv'
    )
    with open(classifications_path, newline='', encoding='utf-8') as class_file:
        count_votes(csv.DictReader(class_file), subjects, workflow_id)

    print('Outputting results')
    output_path = os.path.join(FILE_PATH, 'planet-four-ridges-automl.csv')
    with open(output_path, 'w', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerows(
            classify_subjects(subjects, RIDGE_THRESHOLD)
        )


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment