Skip to content

Instantly share code, notes, and snippets.

@millerdev
Last active March 19, 2020 20:21
Show Gist options
  • Save millerdev/79d34a26a2f325125a32c65dd5315c50 to your computer and use it in GitHub Desktop.
Save millerdev/79d34a26a2f325125a32c65dd5315c50 to your computer and use it in GitHub Desktop.
Categorize CommCare HQ domains needing couch -> sql migration of forms and cases
#!/usr/bin/env python
# coding: utf-8
"""
DEPRECATED moved to
https://github.com/dimagi/commcare-hq/blob/master/corehq/apps/couch_sql_migration/management/commands/couch_domain_report.py
Categorize CommCare HQ domains needing couch -> sql migration of forms and cases
This script requires a domains.csv file, which can be obtained by following
these steps:
1. Download Excel file containing domains using CouchDB for forms and cases
from https://www.commcarehq.org/hq/admin/domains/?es_use_sql_backend=F
2. Open the file and remove the (last three) summary rows.
3. Export to CSV format.
The output is a set of text files listing domains in each category.
Output filenames are formatted as "{ORIGIN}_{category}.txt"
This script lives at
https://gist.github.com/millerdev/79d34a26a2f325125a32c65dd5315c50
Usage:
bucket_domains.py [options] DOMAINS_CSV ORIGIN
-h --help Show this help.
--output-dir=PATH Output directory [default: .]
--plot Show a pie graph of domain categories.
Requires extra setup:
python3 -m venv /path/to/virtualenv
# activate newly craeted virtualenv
pip install docopt pandas matplotlib
"""
import sys
from os.path import join
try:
import pandas as pd
from docopt import docopt
except ImportError:
print("Pre-setup required:\npip install docopt pandas")
sys.exit(1)
NOW = pd.datetime.now()
def main():
args = docopt(__doc__)
data = pd.read_csv(args["DOMAINS_CSV"])
origin = args["ORIGIN"]
output_dir = args["--output-dir"]
categories = {}
all_domains = len(data)
total_domains = 0
print("{:<15} {:>6} {:>12} {}".format(
"category",
"domains",
"submissions",
"migration time",
))
print("--")
for name, query in [
("weird", get_weird), # mistakes?
("small", get_small(2000)),
("smallish", get_small(3500)),
("smallesque", get_small(5000)),
("fossilized", get_old(years=5)),
("inactive", get_old(years=1, limit=10000)),
("inactive_large", get_old(years=1, limit=100000)),
("super_large", get_large(720000)),
("large", get_large(60000)),
("moderate", lambda d: d),
]:
categories[name] = query(data)
data = left_xor(data, categories[name])
if name == "weird":
forms = time_to_complete = "unknown"
else:
forms = categories[name]['# Form Submissions'].apply(pd.to_numeric).sum()
time_to_complete = get_time_to_complete(forms)
total_domains += len(categories[name])
print("{:<15} {:>6} {:>12} {}".format(
name,
len(categories[name]),
forms,
time_to_complete,
))
path = join(output_dir, "%s_%s.txt" % (origin, name))
with open(path, "w", encoding="utf-8") as fh:
fh.write("\n".join(categories[name]['Project']))
assert total_domains == all_domains, (total_domains, all_domains)
print("--")
print("total {:>16}".format(total_domains))
if args["--plot"]:
show_pie_graph(categories)
def get_weird(data):
return data[data["# Active Mobile Workers"] == "Not yet calculated"]
def get_small(upper_limit):
def query(data):
return data[data['# Form Submissions'].apply(pd.to_numeric) < upper_limit]
return query
def get_large(lower_limit):
def query(data):
return data[data['# Form Submissions'].apply(pd.to_numeric) > lower_limit]
return query
def get_old(*, years, limit=None):
years_ago = NOW.replace(year=NOW.year - years)
def query(data):
where = data['Last Form Submission'].apply(pd.to_datetime) <= years_ago
if limit is not None:
where &= data['# Form Submissions'].apply(pd.to_numeric) < limit
return data[where]
return query
def left_xor(left, right):
merged = left.merge(right, indicator=True, how='outer')
merged = merged[merged['_merge'] == 'left_only']
return merged.drop('_merge', 1)
def get_time_to_complete(forms):
# Estimated migration throughput is about 24 forms/sec in the first
# phase. Other phases add extra time. Estimate low here. And always
# add more buffer time if/when sharing these numbers.
num = forms / 10 # 10 forms/sec overall
for divisor, name, next_div in [
(60, "minutes", 60),
(60, "hours", 24),
(24, "days", 7),
(7, "weeks", 4),
]:
num = num / divisor
if num < next_div * 2:
break
return "%.1f %s" % (num, name)
def show_pie_graph(categories):
import matplotlib.pyplot as plt
sizes = []
labels = []
for name, data in categories.items():
sizes.append(len(data))
labels.append(name + " (%d)" % len(data))
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, startangle=90)
ax1.axis('equal')
plt.show()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment