millerdev · March 19, 2020 20:21
diff --git a/bucket_domains.py b/bucket_domains.py
 #!/usr/bin/env python
 # coding: utf-8
 """
 DEPRECATED moved to
 https://github.com/dimagi/commcare-hq/blob/master/corehq/apps/couch_sql_migration/management/commands/couch_domain_report.py

 Categorize CommCare HQ domains needing couch -> sql migration of forms and cases

 This script requires a domains.csv file, which can be obtained by following
 these steps:

 1.  Download Excel file containing domains using CouchDB for forms and cases
    from https://www.commcarehq.org/hq/admin/domains/?es_use_sql_backend=F
 2.  Open the file and remove the (last three) summary rows.
 3.  Export to CSV format.

 The output is a set of text files listing domains in each category.
 Output filenames are formatted as "{ORIGIN}_{category}.txt"

 This script lives at
 https://gist.github.com/millerdev/79d34a26a2f325125a32c65dd5315c50

 Usage:
    bucket_domains.py [options] DOMAINS_CSV ORIGIN

 -h --help           Show this help.
 --output-dir=PATH   Output directory [default: .]
 --plot              Show a pie graph of domain categories.
                    Requires extra setup:

                        python3 -m venv /path/to/virtualenv
                        # activate newly craeted virtualenv
                        pip install docopt pandas matplotlib
 """
 import sys
 from os.path import join

 try:
    import pandas as pd
    from docopt import docopt
 except ImportError:
    print("Pre-setup required:\npip install docopt pandas")
    sys.exit(1)

 NOW = pd.datetime.now()


 def main():
    args = docopt(__doc__)
    data = pd.read_csv(args["DOMAINS_CSV"])
    origin = args["ORIGIN"]
    output_dir = args["--output-dir"]

    categories = {}
    all_domains = len(data)
    total_domains = 0

    print("{:<15} {:>6} {:>12}  {}".format(
        "category",
        "domains",
        "submissions",
        "migration time",
    ))
    print("--")
    for name, query in [
        ("weird", get_weird),  # mistakes?

        ("small", get_small(2000)),
        ("smallish", get_small(3500)),
        ("smallesque", get_small(5000)),

        ("fossilized", get_old(years=5)),
        ("inactive", get_old(years=1, limit=10000)),
        ("inactive_large", get_old(years=1, limit=100000)),

        ("super_large", get_large(720000)),
        ("large", get_large(60000)),

        ("moderate", lambda d: d),
    ]:
        categories[name] = query(data)
        data = left_xor(data, categories[name])

        if name == "weird":
            forms = time_to_complete = "unknown"
        else:
            forms = categories[name]['# Form Submissions'].apply(pd.to_numeric).sum()
            time_to_complete = get_time_to_complete(forms)
        total_domains += len(categories[name])

        print("{:<15} {:>6} {:>12}   {}".format(
            name,
            len(categories[name]),
            forms,
            time_to_complete,
        ))

        path = join(output_dir, "%s_%s.txt" % (origin, name))
        with open(path, "w", encoding="utf-8") as fh:
            fh.write("\n".join(categories[name]['Project']))

    assert total_domains == all_domains, (total_domains, all_domains)
    print("--")
    print("total {:>16}".format(total_domains))

    if args["--plot"]:
        show_pie_graph(categories)

 def get_weird(data):
    return data[data["# Active Mobile Workers"] == "Not yet calculated"]


 def get_small(upper_limit):
    def query(data):
        return data[data['# Form Submissions'].apply(pd.to_numeric) < upper_limit]
    return query


 def get_large(lower_limit):
    def query(data):
        return data[data['# Form Submissions'].apply(pd.to_numeric) > lower_limit]
    return query


 def get_old(*, years, limit=None):
    years_ago = NOW.replace(year=NOW.year - years)
    def query(data):
        where = data['Last Form Submission'].apply(pd.to_datetime) <= years_ago
        if limit is not None:
            where &= data['# Form Submissions'].apply(pd.to_numeric) < limit
        return data[where]
    return query


 def left_xor(left, right):
    merged = left.merge(right, indicator=True, how='outer')
    merged = merged[merged['_merge'] == 'left_only']
    return merged.drop('_merge', 1)


 def get_time_to_complete(forms):
    # Estimated migration throughput is about 24 forms/sec in the first
    # phase. Other phases add extra time. Estimate low here. And always
    # add more buffer time if/when sharing these numbers.
    num = forms / 10  # 10 forms/sec overall
    for divisor, name, next_div in [
        (60, "minutes", 60),
        (60, "hours", 24),
        (24, "days", 7),
        (7, "weeks", 4),
    ]:
        num = num / divisor
        if num < next_div * 2:
            break
    return "%.1f %s" % (num, name)


 def show_pie_graph(categories):
    import matplotlib.pyplot as plt

    sizes = []
    labels = []
    for name, data in categories.items():
        sizes.append(len(data))
        labels.append(name + " (%d)" % len(data))

    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, labels=labels, startangle=90)
    ax1.axis('equal')
    plt.show()


 if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	# coding: utf-8
	"""
	DEPRECATED moved to
	https://github.com/dimagi/commcare-hq/blob/master/corehq/apps/couch_sql_migration/management/commands/couch_domain_report.py

	Categorize CommCare HQ domains needing couch -> sql migration of forms and cases

	This script requires a domains.csv file, which can be obtained by following
	these steps:

	1. Download Excel file containing domains using CouchDB for forms and cases
	from https://www.commcarehq.org/hq/admin/domains/?es_use_sql_backend=F
	2. Open the file and remove the (last three) summary rows.
	3. Export to CSV format.

	The output is a set of text files listing domains in each category.
	Output filenames are formatted as "{ORIGIN}_{category}.txt"

	This script lives at
	https://gist.github.com/millerdev/79d34a26a2f325125a32c65dd5315c50

	Usage:
	bucket_domains.py [options] DOMAINS_CSV ORIGIN

	-h --help Show this help.
	--output-dir=PATH Output directory [default: .]
	--plot Show a pie graph of domain categories.
	Requires extra setup:

	python3 -m venv /path/to/virtualenv
	# activate newly craeted virtualenv
	pip install docopt pandas matplotlib
	"""
	import sys
	from os.path import join

	try:
	import pandas as pd
	from docopt import docopt
	except ImportError:
	print("Pre-setup required:\npip install docopt pandas")
	sys.exit(1)

	NOW = pd.datetime.now()


	def main():
	args = docopt(__doc__)
	data = pd.read_csv(args["DOMAINS_CSV"])
	origin = args["ORIGIN"]
	output_dir = args["--output-dir"]

	categories = {}
	all_domains = len(data)
	total_domains = 0

	print("{:<15} {:>6} {:>12} {}".format(
	"category",
	"domains",
	"submissions",
	"migration time",
	))
	print("--")
	for name, query in [
	("weird", get_weird), # mistakes?

	("small", get_small(2000)),
	("smallish", get_small(3500)),
	("smallesque", get_small(5000)),

	("fossilized", get_old(years=5)),
	("inactive", get_old(years=1, limit=10000)),
	("inactive_large", get_old(years=1, limit=100000)),

	("super_large", get_large(720000)),
	("large", get_large(60000)),

	("moderate", lambda d: d),
	]:
	categories[name] = query(data)
	data = left_xor(data, categories[name])

	if name == "weird":
	forms = time_to_complete = "unknown"
	else:
	forms = categories[name]['# Form Submissions'].apply(pd.to_numeric).sum()
	time_to_complete = get_time_to_complete(forms)
	total_domains += len(categories[name])

	print("{:<15} {:>6} {:>12} {}".format(
	name,
	len(categories[name]),
	forms,
	time_to_complete,
	))

	path = join(output_dir, "%s_%s.txt" % (origin, name))
	with open(path, "w", encoding="utf-8") as fh:
	fh.write("\n".join(categories[name]['Project']))

	assert total_domains == all_domains, (total_domains, all_domains)
	print("--")
	print("total {:>16}".format(total_domains))

	if args["--plot"]:
	show_pie_graph(categories)

	def get_weird(data):
	return data[data["# Active Mobile Workers"] == "Not yet calculated"]


	def get_small(upper_limit):
	def query(data):
	return data[data['# Form Submissions'].apply(pd.to_numeric) < upper_limit]
	return query


	def get_large(lower_limit):
	def query(data):
	return data[data['# Form Submissions'].apply(pd.to_numeric) > lower_limit]
	return query


	def get_old(*, years, limit=None):
	years_ago = NOW.replace(year=NOW.year - years)
	def query(data):
	where = data['Last Form Submission'].apply(pd.to_datetime) <= years_ago
	if limit is not None:
	where &= data['# Form Submissions'].apply(pd.to_numeric) < limit
	return data[where]
	return query


	def left_xor(left, right):
	merged = left.merge(right, indicator=True, how='outer')
	merged = merged[merged['_merge'] == 'left_only']
	return merged.drop('_merge', 1)


	def get_time_to_complete(forms):
	# Estimated migration throughput is about 24 forms/sec in the first
	# phase. Other phases add extra time. Estimate low here. And always
	# add more buffer time if/when sharing these numbers.
	num = forms / 10 # 10 forms/sec overall
	for divisor, name, next_div in [
	(60, "minutes", 60),
	(60, "hours", 24),
	(24, "days", 7),
	(7, "weeks", 4),
	]:
	num = num / divisor
	if num < next_div * 2:
	break
	return "%.1f %s" % (num, name)


	def show_pie_graph(categories):
	import matplotlib.pyplot as plt

	sizes = []
	labels = []
	for name, data in categories.items():
	sizes.append(len(data))
	labels.append(name + " (%d)" % len(data))

	fig1, ax1 = plt.subplots()
	ax1.pie(sizes, labels=labels, startangle=90)
	ax1.axis('equal')
	plt.show()


	if __name__ == "__main__":
	main()