indygreg · September 23, 2015 14:57
diff --git a/moz-survey-analysis.py b/moz-survey-analysis.py
 #!/usr/bin/env python2.7

 import collections
 import csv
 import re
 import sys

 #import plotly.plotly as py
 #from plotly.graph_objs import *
 from bokeh.charts import Bar, output_file, show, save
 from bokeh.embed import components

 # Matches text of the form "<prefix>[<choice>]": group 1 is everything before
 # the "[" (it may not contain "]"), group 2 is whatever sits between that "["
 # and the final "]". A raw string keeps "\[" and "\]" out of the hands of the
 # string-literal escape parser.
 RE_QUESTION_CHOICE = re.compile(r'^([^\]]+)\[(.*)\]$')

 # Header of the column whose value assigns each respondent to a group; used by
 # count_group_size() and as the grouping question in the __main__ block.
 TEAM_QUESTION = 'What product do you work on the most?'
 # Column headers whose cells iterate_answers() splits on ';' into several
 # answers. reduce_answers() also measures these against respondent head count.
 MULTIPLE_CHOICE_QUESTIONS = {
    'What other languages do you work with?',
    'What products do you work on? (Check all that apply)',
    'What would improve your Treeherder experience?',
    'What are your thoughts on MQ?',
    'Which version control tools do you currently use for Mozilla projects?',
    'Which workflows do you practice to author commits with Mercurial? (Check all that apply)',
    'Why do you prefer Git over Mercurial? (check all that apply)',
    'Why do you prefer Mercurial over Git? (Check all that apply)',
    'If you use Git for working on Firefox, how do you obtain the source code? (Choose all that apply)',
    'What version control tools do you use to work with mozilla-central and other project branches (inbound, fx-team, etc)?',
    'How do you prefer to debug test failures?',
    'Which version control tool do you prefer?',
 }
 # Column headers that counts_by_question() and counts_grouped_by_answer() skip
 # entirely, so their cells are never tallied.
 OPEN_TEXT_QUESTIONS = {
    'Please leave any other comments you may have on the Firefox build system and how improvements could affect your productivity.',
    'Other',
    'Please give more info in your answer(s) above',
    'What\'s your biggest complaint about Mercurial as a general tool?',
    'What\'s your biggest complaint about Mercurial at Mozilla?',
    'Other feedback about Mercurial',
    'What\'s your biggest complaint about Git?',
    'Other feedback about Git at Mozilla',
    'In your own words, feel free to tell us more about why you prefer Mercurial over Git.',
    'In your own words, feel free to tell us why you prefer Git over Mercurial.',
    'Before you go, what things did we fail to cover that would provide a boost to your productivity? Please be as specific as possible.',
 }

 # Maps a group name (an answer to TEAM_QUESTION) to the short label that
 # plot_by_group() uses as the key of each data series. A group missing from
 # this table raises KeyError there.
 PRODUCT_SHORTNAMES = {
    'Gecko / Platform': 'Platform',
    'Firefox for Desktop (Windows, OS X, Linux)': 'Fx-Team',
    'Firefox OS': 'FxOS',
    'Firefox for Android': 'Fennec',
    'Firefox for iOS': 'iOS',
    'Product Support (automation, tools, infrastructure, etc)': 'Support',
    'Other': 'Other',
    'All Other Responses': 'Other',
 }

 # Groups that reduce_answers() keeps when it processes grouped tallies; counts
 # belonging to any other group are dropped. The commented-out entries are
 # groups currently excluded.
 FILTER_PRODUCTS = {
    'Gecko / Platform',
    'Firefox for Desktop (Windows, OS X, Linux)',
    #'Firefox OS',
    #'Product Support (automation, tools, infrastructure, etc)',
 }

 # Known answer scales, each listed in the order its answers should be shown.
 # sort_answers() walks this list from the top and uses the FIRST scale that
 # contains every answer it was given, so the order of the scales matters when
 # one scale's answers are a subset of another's.
 SORTS = [
    [
        'No',
        'Yes',
    ],
    [
        "This is my first job / I'm not sure",
        'Slower',
        'About the same',
        'Faster',
    ],
    [
        'Not sure',
        'No more productive - already at peak productivity',
        'Up to 10% more productive',
        'Up to 25% more productive',
        'Up to 50% more productive',
        "100%+ more productive (you'd be a machine)",
    ],
    [
        "N/A or I'm not sure",
        'Not at all satisfied',
        "It's not horrible, but it's still pretty bad",
        'It sucks a little',
        "It's neither bad nor good",
        "It's OK, I guess",
        "It's pretty good",
        "It's awesome!",
    ],
    [
        'N/A',
        "It's gotten worse",
        'About the same',
        'Gotten better',
    ],
    [
        'N/A',
        '1 = least impactful',
        '2',
        '3',
        '4 = most impactful',
    ],
    [
        'N/A',
        '1 = little impact',
        '2',
        '3',
        '4 = most impact',
    ],
    [
        'N/A',
        '1 = least impact',
        '2',
        '3',
        '4 = most impact',
    ],
    [
        '1 - no impact',
        '2 - little impact',
        '3 - a fair amount of impact',
        '4 - a significant amount of impact',
        '5 - an extremely frustrating amount of impact',
    ],
    [
        'N/A',
        'Keep about the same',
        'Increase investment',
        'Drastically increase investment',
    ],
    [
        'N/A',
        'Extremely dissatisfied',
        'Below average',
        'Average',
        'Above average',
        "It's awesome!",
    ],
    [
        'No impact / Not applicable',
        'A little impact',
        'Moderate impact',
        'Significant impact',
        'Tons of impact / Implement this ASAP',
    ],
    [
        'No knowledge',
        'I know enough to do just the basics',
        'Fairly competent (I still get stuck from time to time)',
        'Highly competent',
    ],
    [
        'N/A',
        'Once every few months',
        'Monthly',
        'Weekly',
        'Daily',
    ],
    [
        'Never',
        'Over a year ago',
        'A few months ago',
        'A few weeks ago',
        'A few days ago',
    ],
    [
        'N/A',
        'Not satisfied at all',
        'Below average',
        'Average',
        'Above average',
        "It's awesome!",
    ],
    [
        'N/A (not relevant to me)',
        'Cut all investment',
        'Cut some investment',
        'Keep about the same',
        'Increase investment',
        'Drastically increase investment',
    ],
    [
        'N/A',
        'Cut all investment',
        'Decrease investment',
        'Keep about the same',
        'Increase investment',
        'Drastically increase investment',
    ],
 ]


 def read_csv(fh):
    """Parse CSV input into a header row and the remaining data rows.

    ``fh`` is anything ``csv.reader`` accepts (an open file or another
    iterable of lines). Returns a ``(columns, responses)`` tuple where
    ``columns`` is the first row and ``responses`` is a list of all later
    rows. Empty input gives two empty lists rather than raising
    ``StopIteration``.
    """
    reader = csv.reader(fh)
    # The next() builtin works on Python 2.6+ and 3; the reader.next() method
    # exists on Python 2 only.
    columns = next(reader, [])
    responses = list(reader)

    return columns, responses


 def iterate_answers(column, v):
    """Yield the individual answers held in one response cell.

    Cells of questions listed in ``MULTIPLE_CHOICE_QUESTIONS`` are split on
    ``;`` and every non-empty, whitespace-stripped piece is yielded. A cell
    of any other question is yielded once, unchanged.
    """
    if column not in MULTIPLE_CHOICE_QUESTIONS:
        yield v
        return

    for piece in v.split(';'):
        choice = piece.strip()
        if choice:
            yield choice


 def count_group_size(columns, responses):
    """Count the respondents in each group.

    A respondent's group is the value in the first column whose header
    equals ``TEAM_QUESTION``. Returns a ``collections.Counter`` mapping
    group -> number of responses. When no column carries that header the
    counter is empty, so callers can always iterate the result.
    """
    for i, column in enumerate(columns):
        if column == TEAM_QUESTION:
            return collections.Counter(r[i] for r in responses)

    # No grouping column found: report zero groups instead of falling off the
    # end and handing the caller None.
    return collections.Counter()


 def reduce_answers(columns, counts, group_sizes, ignore=False,
                    min_share=0.05, min_count=2):
    """Fold rarely chosen answers into an "All Other Responses" bucket.

    Args:
        columns: question names, in the order they are processed. Names
            absent from ``counts`` are skipped.
        counts: dict of question -> tallies. A ``defaultdict`` is read as
            answer -> group -> count, and only groups in
            ``FILTER_PRODUCTS`` are kept; anything else is read as
            answer -> count.
        group_sizes: mapping of group -> number of respondents. It is the
            denominator for grouped questions in
            ``MULTIPLE_CHOICE_QUESTIONS``, where one respondent can supply
            several answers.
        ignore: when true, rare answers are dropped instead of bucketed.
        min_share: an answer is rare when its share of the total is at or
            below this fraction.
        min_count: an answer is rare when it was chosen at most this many
            times.

    Returns:
        A new dict shaped like ``counts`` holding the reduced tallies.
    """
    new_counts = {}
    for column in columns:
        if column not in counts:
            continue

        v = counts[column]
        grouped = isinstance(v, collections.defaultdict)

        # Votes per answer, looking only at the groups being reported.
        answer_counts = collections.Counter()
        if grouped:
            for a, group_counts in v.items():
                for group, c in group_counts.items():
                    if group in FILTER_PRODUCTS:
                        answer_counts[a] += c
        else:
            for a, c in v.items():
                answer_counts[a] += c

        if grouped and column in MULTIPLE_CHOICE_QUESTIONS:
            # Shares are measured against head count here, not against the
            # number of boxes ticked.
            total = sum(c for group, c in group_sizes.items()
                        if group in FILTER_PRODUCTS)
        else:
            total = sum(answer_counts.values())

        to_filter = set()
        for a, c in answer_counts.items():
            # With nothing to divide by, skip the share test; the absolute
            # threshold still applies.
            rare = total > 0 and float(c) / float(total) <= min_share
            if rare or c <= min_count:
                to_filter.add(a)

        if grouped:
            reduced = collections.defaultdict(collections.Counter)
            for a, group_counts in v.items():
                if a in to_filter:
                    if ignore:
                        continue
                    a = 'All Other Responses'

                for group, c in group_counts.items():
                    if group in FILTER_PRODUCTS:
                        reduced[a][group] += c
        else:
            reduced = collections.Counter()
            for a, c in v.items():
                if a in to_filter:
                    if ignore:
                        continue
                    a = 'All Other Responses'

                reduced[a] += c

        new_counts[column] = reduced

    return new_counts


 def counts_by_question(columns, responses):
    """Tally how often each answer was given, per question.

    The first two columns and every column listed in
    ``OPEN_TEXT_QUESTIONS`` are skipped, as are empty cells. Returns a dict
    mapping question -> ``Counter`` of answer -> count.
    """
    tallies = {}

    for idx, question in enumerate(columns):
        if idx < 2 or question in OPEN_TEXT_QUESTIONS:
            continue

        tally = collections.Counter()
        tallies[question] = tally

        for row in responses:
            cell = row[idx]
            if cell:
                for answer in iterate_answers(question, cell):
                    tally[answer] += 1

    return tallies

 def counts_grouped_by_answer(columns, responses, question):
    """Tally answers per question, split by each respondent's group.

    ``question`` names the column whose value identifies a respondent's
    group; ``columns.index`` raises ``ValueError`` if it is absent. The
    first two columns, the grouping column itself, columns listed in
    ``OPEN_TEXT_QUESTIONS`` and empty cells are skipped. Returns a dict
    mapping question -> answer -> group -> count.
    """
    group_col = columns.index(question)
    tallies = {}

    for idx, name in enumerate(columns):
        if idx < 2 or idx == group_col or name in OPEN_TEXT_QUESTIONS:
            continue

        per_answer = collections.defaultdict(collections.Counter)
        tallies[name] = per_answer

        for row in responses:
            cell = row[idx]
            if cell:
                bucket = row[group_col]
                for answer in iterate_answers(name, cell):
                    per_answer[answer][bucket] += 1

    return tallies


 def sort_answers(answers):
    """Return the given answers in display order.

    ``answers`` may be any iterable, including a one-shot generator; it is
    read exactly once. The first scale in ``SORTS`` that contains every
    answer (ignoring 'All Other Responses') fixes the order, with
    'All Other Responses' placed first when present. If no scale matches,
    the answers are returned in plain sorted order.
    """
    answers = list(answers)
    present = set(answers)
    # Computed once: the scale lookup does not depend on the loop variable.
    relevant = present - {'All Other Responses'}

    for scale in SORTS:
        if all(a in scale for a in relevant):
            srtd = list(scale)
            if 'All Other Responses' in present:
                srtd.insert(0, 'All Other Responses')
            return [a for a in srtd if a in present]

    return sorted(answers)


 def plot_by_group(columns, answers, group_sizes):
    """Build one grouped bar chart per question.

    ``answers`` maps question -> answer -> group -> count and
    ``group_sizes`` maps group -> number of respondents. Each value plotted
    is the share of a group that gave an answer, as a whole percentage
    (truncated by ``int``). Questions missing from ``answers`` are skipped.
    Returns the ``(script, divs)`` pair produced by ``components``.
    """
    charts = []

    for question in columns:
        if question not in answers:
            continue

        per_answer = answers[question]

        # Every group that shows up under at least one answer.
        groups_seen = set()
        for tallies in per_answer.values():
            groups_seen.update(tallies)

        x = sort_answers(set(per_answer.keys()))
        data = {}

        for group in sorted(groups_seen):
            y = []
            for answer in x:
                tallies = per_answer[answer]
                if group in tallies:
                    share = float(tallies[group]) / float(group_sizes[group])
                    y.append(int(share * 100.0))
                else:
                    y.append(0)

            # Series are keyed by the group's short label.
            data[PRODUCT_SHORTNAMES[group]] = y

        chart = Bar(data,
            cat=x,
            title=question,
            ylabel='Percentage',
            width=1280,
            height=560,
            legend=True,
            tools='',
        )
        chart.title_text_font_size = '13px'
        charts.append(chart)

    script, divs = components(charts)
    return script, divs


 def write_html(fh, script, divs):
    """Write a standalone results page to ``fh``.

    The head holds the title, the bokeh 0.9.3 stylesheet and script tags,
    and ``script``; the body holds every entry of ``divs`` in order.
    Nothing is escaped: ``script`` and ``divs`` are written verbatim.
    """
    pieces = [
        '<html><head><title>Survey Results</title>',
        '<link href="https://people.mozilla.org/~gszorc/bokeh-0.9.3.min.css" rel="stylesheet" type="text/css" />',
        '<script src="https://people.mozilla.org/~gszorc/bokeh-0.9.3.min.js"></script>',
        script,
        '</head><body>\n',
    ]
    pieces.extend(divs)
    pieces.append('</body></html>')

    for piece in pieces:
        fh.write(piece)


 if __name__ == '__main__':
    # Two arguments are required: the survey CSV to read and the HTML file
    # to write. Fail with a usage line instead of a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit('usage: %s RESPONSES_CSV OUTPUT_HTML' % sys.argv[0])

    # Binary mode is what the Python 2 csv module expects of its input file.
    with open(sys.argv[1], 'rb') as fh:
        columns, responses = read_csv(fh)

    outfile = sys.argv[2]

    group_sizes = count_group_size(columns, responses)

    # Print the size of every group before any filtering is applied.
    for group, count in sorted(group_sizes.items()):
        print('N=%d\t%s' % (count, group))

    # Ungrouped tallies; computed but not consumed by the plotting below.
    by_question = counts_by_question(columns, responses)
    #by_question = reduce_answers(columns, by_question, group_sizes, ignore=True)
    by_group = counts_grouped_by_answer(columns, responses, TEAM_QUESTION)
    by_group_reduced = reduce_answers(columns, by_group, group_sizes, ignore=False)

    script, divs = plot_by_group(columns, by_group_reduced, group_sizes)

    with open(outfile, 'wb') as fh:
        write_html(fh, script, divs)
diff --git a/results.html b/results.html
 <html><head><title>Survey Results</title><link href="https://people.mozilla.org/~gszorc/bokeh-0.9.3.min.css" rel="stylesheet" type="text/css" /><script src="https://people.mozilla.org/~gszorc/bokeh-0.9.3.min.js"></script><script type="text/javascript">
    Bokeh.$(function() {
	#!/usr/bin/env python2.7

	import collections
	import csv
	import re
	import sys

	#import plotly.plotly as py
	#from plotly.graph_objs import *
	from bokeh.charts import Bar, output_file, show, save
	from bokeh.embed import components

	# Matches text of the form "<prefix>[<choice>]": group 1 is everything
	# before the "[" (it may not contain "]"), group 2 is whatever sits between
	# that "[" and the final "]". A raw string keeps "\[" and "\]" out of the
	# hands of the string-literal escape parser.
	RE_QUESTION_CHOICE = re.compile(r'^([^\]]+)\[(.*)\]$')

	# Header of the column whose value assigns each respondent to a group.
	TEAM_QUESTION = 'What product do you work on the most?'
	# Column headers whose cells iterate_answers() splits on ';'.
	MULTIPLE_CHOICE_QUESTIONS = {
	    'What other languages do you work with?',
	    'What products do you work on? (Check all that apply)',
	    'What would improve your Treeherder experience?',
	    'What are your thoughts on MQ?',
	    'Which version control tools do you currently use for Mozilla projects?',
	    'Which workflows do you practice to author commits with Mercurial? (Check all that apply)',
	    'Why do you prefer Git over Mercurial? (check all that apply)',
	    'Why do you prefer Mercurial over Git? (Check all that apply)',
	    'If you use Git for working on Firefox, how do you obtain the source code? (Choose all that apply)',
	    'What version control tools do you use to work with mozilla-central and other project branches (inbound, fx-team, etc)?',
	    'How do you prefer to debug test failures?',
	    'Which version control tool do you prefer?',
	}
	# Column headers that the counting functions skip entirely.
	OPEN_TEXT_QUESTIONS = {
	    'Please leave any other comments you may have on the Firefox build system and how improvements could affect your productivity.',
	    'Other',
	    'Please give more info in your answer(s) above',
	    'What\'s your biggest complaint about Mercurial as a general tool?',
	    'What\'s your biggest complaint about Mercurial at Mozilla?',
	    'Other feedback about Mercurial',
	    'What\'s your biggest complaint about Git?',
	    'Other feedback about Git at Mozilla',
	    'In your own words, feel free to tell us more about why you prefer Mercurial over Git.',
	    'In your own words, feel free to tell us why you prefer Git over Mercurial.',
	    'Before you go, what things did we fail to cover that would provide a boost to your productivity? Please be as specific as possible.',
	}

	# Maps a group name to the short label plot_by_group() uses for its series.
	PRODUCT_SHORTNAMES = {
	    'Gecko / Platform': 'Platform',
	    'Firefox for Desktop (Windows, OS X, Linux)': 'Fx-Team',
	    'Firefox OS': 'FxOS',
	    'Firefox for Android': 'Fennec',
	    'Firefox for iOS': 'iOS',
	    'Product Support (automation, tools, infrastructure, etc)': 'Support',
	    'Other': 'Other',
	    'All Other Responses': 'Other',
	}

	# Groups that reduce_answers() keeps in grouped tallies; the commented-out
	# entries are groups currently excluded.
	FILTER_PRODUCTS = {
	    'Gecko / Platform',
	    'Firefox for Desktop (Windows, OS X, Linux)',
	    #'Firefox OS',
	    #'Product Support (automation, tools, infrastructure, etc)',
	}

	# Known answer scales, each listed in display order. sort_answers() uses
	# the first scale that contains every answer it is given.
	SORTS = [
	    [
	        'No',
	        'Yes',
	    ],
	    [
	        "This is my first job / I'm not sure",
	        'Slower',
	        'About the same',
	        'Faster',
	    ],
	    [
	        'Not sure',
	        'No more productive - already at peak productivity',
	        'Up to 10% more productive',
	        'Up to 25% more productive',
	        'Up to 50% more productive',
	        "100%+ more productive (you'd be a machine)",
	    ],
	    [
	        "N/A or I'm not sure",
	        'Not at all satisfied',
	        "It's not horrible, but it's still pretty bad",
	        'It sucks a little',
	        "It's neither bad nor good",
	        "It's OK, I guess",
	        "It's pretty good",
	        "It's awesome!",
	    ],
	    [
	        'N/A',
	        "It's gotten worse",
	        'About the same',
	        'Gotten better',
	    ],
	    [
	        'N/A',
	        '1 = least impactful',
	        '2',
	        '3',
	        '4 = most impactful',
	    ],
	    [
	        'N/A',
	        '1 = little impact',
	        '2',
	        '3',
	        '4 = most impact',
	    ],
	    [
	        'N/A',
	        '1 = least impact',
	        '2',
	        '3',
	        '4 = most impact',
	    ],
	    [
	        '1 - no impact',
	        '2 - little impact',
	        '3 - a fair amount of impact',
	        '4 - a significant amount of impact',
	        '5 - an extremely frustrating amount of impact',
	    ],
	    [
	        'N/A',
	        'Keep about the same',
	        'Increase investment',
	        'Drastically increase investment',
	    ],
	    [
	        'N/A',
	        'Extremely dissatisfied',
	        'Below average',
	        'Average',
	        'Above average',
	        "It's awesome!",
	    ],
	    [
	        'No impact / Not applicable',
	        'A little impact',
	        'Moderate impact',
	        'Significant impact',
	        'Tons of impact / Implement this ASAP',
	    ],
	    [
	        'No knowledge',
	        'I know enough to do just the basics',
	        'Fairly competent (I still get stuck from time to time)',
	        'Highly competent',
	    ],
	    [
	        'N/A',
	        'Once every few months',
	        'Monthly',
	        'Weekly',
	        'Daily',
	    ],
	    [
	        'Never',
	        'Over a year ago',
	        'A few months ago',
	        'A few weeks ago',
	        'A few days ago',
	    ],
	    [
	        'N/A',
	        'Not satisfied at all',
	        'Below average',
	        'Average',
	        'Above average',
	        "It's awesome!",
	    ],
	    [
	        'N/A (not relevant to me)',
	        'Cut all investment',
	        'Cut some investment',
	        'Keep about the same',
	        'Increase investment',
	        'Drastically increase investment',
	    ],
	    [
	        'N/A',
	        'Cut all investment',
	        'Decrease investment',
	        'Keep about the same',
	        'Increase investment',
	        'Drastically increase investment',
	    ],
	]


	def read_csv(fh):
	    """Parse CSV input into a header row and the remaining data rows.

	    ``fh`` is anything ``csv.reader`` accepts. Returns a
	    ``(columns, responses)`` tuple where ``columns`` is the first row and
	    ``responses`` is a list of all later rows. Empty input gives two empty
	    lists rather than raising ``StopIteration``.
	    """
	    reader = csv.reader(fh)
	    # The next() builtin works on Python 2.6+ and 3; reader.next() is
	    # Python 2 only.
	    columns = next(reader, [])
	    responses = list(reader)

	    return columns, responses


	def iterate_answers(column, v):
	    """Yield the individual answers held in one response cell.

	    Cells of questions listed in ``MULTIPLE_CHOICE_QUESTIONS`` are split on
	    ``;`` and every non-empty, whitespace-stripped piece is yielded. A cell
	    of any other question is yielded once, unchanged.
	    """
	    if column in MULTIPLE_CHOICE_QUESTIONS:
	        for c in v.split(';'):
	            c = c.strip()
	            if c:
	                yield c
	    else:
	        yield v


	def count_group_size(columns, responses):
	    """Count the respondents in each group.

	    A respondent's group is the value in the first column whose header
	    equals ``TEAM_QUESTION``. Returns a ``collections.Counter`` mapping
	    group -> number of responses; it is empty when no column carries that
	    header, so callers can always iterate the result.
	    """
	    for i, column in enumerate(columns):
	        if column == TEAM_QUESTION:
	            return collections.Counter(r[i] for r in responses)

	    # No grouping column found: report zero groups instead of None.
	    return collections.Counter()


	def reduce_answers(columns, counts, group_sizes, ignore=False,
	                   min_share=0.05, min_count=2):
	    """Fold rarely chosen answers into an "All Other Responses" bucket.

	    Args:
	        columns: question names, in the order they are processed. Names
	            absent from ``counts`` are skipped.
	        counts: dict of question -> tallies. A ``defaultdict`` is read as
	            answer -> group -> count, and only groups in
	            ``FILTER_PRODUCTS`` are kept; anything else is read as
	            answer -> count.
	        group_sizes: mapping of group -> number of respondents. It is the
	            denominator for grouped questions in
	            ``MULTIPLE_CHOICE_QUESTIONS``.
	        ignore: when true, rare answers are dropped instead of bucketed.
	        min_share: an answer is rare when its share of the total is at or
	            below this fraction.
	        min_count: an answer is rare when it was chosen at most this many
	            times.

	    Returns:
	        A new dict shaped like ``counts`` holding the reduced tallies.
	    """
	    new_counts = {}
	    for column in columns:
	        if column not in counts:
	            continue

	        v = counts[column]
	        grouped = isinstance(v, collections.defaultdict)

	        # Votes per answer, looking only at the groups being reported.
	        answer_counts = collections.Counter()
	        if grouped:
	            for a, group_counts in v.items():
	                for group, c in group_counts.items():
	                    if group in FILTER_PRODUCTS:
	                        answer_counts[a] += c
	        else:
	            for a, c in v.items():
	                answer_counts[a] += c

	        if grouped and column in MULTIPLE_CHOICE_QUESTIONS:
	            # Shares are measured against head count here, not against the
	            # number of boxes ticked.
	            total = sum(c for group, c in group_sizes.items()
	                        if group in FILTER_PRODUCTS)
	        else:
	            total = sum(answer_counts.values())

	        to_filter = set()
	        for a, c in answer_counts.items():
	            # With nothing to divide by, skip the share test; the absolute
	            # threshold still applies.
	            rare = total > 0 and float(c) / float(total) <= min_share
	            if rare or c <= min_count:
	                to_filter.add(a)

	        if grouped:
	            reduced = collections.defaultdict(collections.Counter)
	            for a, group_counts in v.items():
	                if a in to_filter:
	                    if ignore:
	                        continue
	                    a = 'All Other Responses'

	                for group, c in group_counts.items():
	                    if group in FILTER_PRODUCTS:
	                        reduced[a][group] += c
	        else:
	            reduced = collections.Counter()
	            for a, c in v.items():
	                if a in to_filter:
	                    if ignore:
	                        continue
	                    a = 'All Other Responses'

	                reduced[a] += c

	        new_counts[column] = reduced

	    return new_counts


	def counts_by_question(columns, responses):
	    """Tally how often each answer was given, per question.

	    The first two columns and every column listed in
	    ``OPEN_TEXT_QUESTIONS`` are skipped, as are empty cells. Returns a dict
	    mapping question -> ``Counter`` of answer -> count.
	    """
	    counts = {}

	    for i, column in enumerate(columns):
	        if i < 2:
	            continue

	        if column in OPEN_TEXT_QUESTIONS:
	            continue

	        counts[column] = collections.Counter()

	        for r in responses:
	            v = r[i]
	            if not v:
	                continue

	            for a in iterate_answers(column, v):
	                counts[column][a] += 1

	    return counts

	def counts_grouped_by_answer(columns, responses, question):
	    """Tally answers per question, split by each respondent's group.

	    ``question`` names the column whose value identifies a respondent's
	    group; ``columns.index`` raises ``ValueError`` if it is absent. The
	    first two columns, the grouping column itself, columns listed in
	    ``OPEN_TEXT_QUESTIONS`` and empty cells are skipped. Returns a dict
	    mapping question -> answer -> group -> count.
	    """
	    group_index = columns.index(question)

	    counts = {}

	    for i, column in enumerate(columns):
	        if i < 2:
	            continue

	        if i == group_index:
	            continue

	        if column in OPEN_TEXT_QUESTIONS:
	            continue

	        counts[column] = collections.defaultdict(collections.Counter)

	        for r in responses:
	            v = r[i]
	            if not v:
	                continue

	            group = r[group_index]
	            for a in iterate_answers(column, v):
	                counts[column][a][group] += 1

	    return counts


	def sort_answers(answers):
	    """Return the given answers in display order.

	    ``answers`` may be any iterable, including a one-shot generator; it is
	    read exactly once. The first scale in ``SORTS`` that contains every
	    answer (ignoring 'All Other Responses') fixes the order, with
	    'All Other Responses' placed first when present. If no scale matches,
	    the answers are returned in plain sorted order.
	    """
	    answers = list(answers)
	    present = set(answers)
	    # Computed once: the scale lookup does not depend on the loop variable.
	    relevant = present - {'All Other Responses'}

	    for scale in SORTS:
	        if all(a in scale for a in relevant):
	            srtd = list(scale)
	            if 'All Other Responses' in present:
	                srtd.insert(0, 'All Other Responses')
	            return [a for a in srtd if a in present]

	    return sorted(answers)


	def plot_by_group(columns, answers, group_sizes):
	    """Build one grouped bar chart per question.

	    ``answers`` maps question -> answer -> group -> count and
	    ``group_sizes`` maps group -> number of respondents. Each value plotted
	    is the share of a group that gave an answer, as a whole percentage
	    (truncated by ``int``). Questions missing from ``answers`` are skipped.
	    Returns the ``(script, divs)`` pair produced by ``components``.
	    """
	    plots = []

	    for column in columns:
	        if column not in answers:
	            continue

	        all_answers = set(answers[column].keys())
	        # Sum counts per group so every group with data gets a series.
	        counts_by_group = collections.Counter()
	        for counts in answers[column].values():
	            for group, count in counts.items():
	                counts_by_group[group] += count

	        x = sort_answers(all_answers)
	        data = {}

	        for group in sorted(counts_by_group):
	            y = []

	            for answer in x:
	                counts = answers[column][answer]
	                if group not in counts:
	                    y.append(0)
	                else:
	                    percent = float(counts[group]) / float(group_sizes[group])
	                    y.append(int(percent * 100.0))

	            # Series are keyed by the group's short label.
	            data[PRODUCT_SHORTNAMES[group]] = y

	        b = Bar(data,
	            cat=x,
	            title=column,
	            ylabel='Percentage',
	            width=1280,
	            height=560,
	            legend=True,
	            tools='',
	        )
	        b.title_text_font_size = '13px'
	        plots.append(b)

	    script, divs = components(plots)
	    return script, divs


	def write_html(fh, script, divs):
	    """Write a standalone results page to ``fh``.

	    The head holds the title, the bokeh 0.9.3 stylesheet and script tags,
	    and ``script``; the body holds every entry of ``divs`` in order.
	    Nothing is escaped: ``script`` and ``divs`` are written verbatim.
	    """
	    fh.write('<html><head><title>Survey Results</title>')
	    fh.write('<link href="https://people.mozilla.org/~gszorc/bokeh-0.9.3.min.css" rel="stylesheet" type="text/css" />')
	    fh.write('<script src="https://people.mozilla.org/~gszorc/bokeh-0.9.3.min.js"></script>')
	    fh.write(script)
	    fh.write('</head><body>\n')

	    for div in divs:
	        fh.write(div)

	    fh.write('</body></html>')


	if __name__ == '__main__':
	    # Two arguments are required: the survey CSV to read and the HTML file
	    # to write. Fail with a usage line instead of a bare IndexError.
	    if len(sys.argv) < 3:
	        sys.exit('usage: %s RESPONSES_CSV OUTPUT_HTML' % sys.argv[0])

	    # Binary mode is what the Python 2 csv module expects of its input file.
	    with open(sys.argv[1], 'rb') as fh:
	        columns, responses = read_csv(fh)

	    outfile = sys.argv[2]

	    group_sizes = count_group_size(columns, responses)

	    # Print the size of every group before any filtering is applied.
	    for group, count in sorted(group_sizes.items()):
	        print('N=%d\t%s' % (count, group))

	    # Ungrouped tallies; computed but not consumed by the plotting below.
	    by_question = counts_by_question(columns, responses)
	    #by_question = reduce_answers(columns, by_question, group_sizes, ignore=True)
	    by_group = counts_grouped_by_answer(columns, responses, TEAM_QUESTION)
	    by_group_reduced = reduce_answers(columns, by_group, group_sizes, ignore=False)

	    script, divs = plot_by_group(columns, by_group_reduced, group_sizes)

	    with open(outfile, 'wb') as fh:
	        write_html(fh, script, divs)
	<html><head><title>Survey Results</title><link href="https://people.mozilla.org/~gszorc/bokeh-0.9.3.min.css" rel="stylesheet" type="text/css" /><script src="https://people.mozilla.org/~gszorc/bokeh-0.9.3.min.js"></script><script type="text/javascript">
	Bokeh.$(function() {