Created
January 17, 2020 06:33
-
-
Save rich-hart/ea4b56b4dea453f907abb47c98196473 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import random

import networkx as nx
import numpy as np
from fuzzywuzzy import fuzz
from networkx.algorithms.components import connected_components
# Only referenced by the commented-out experiment at the bottom of the script.
RANDOMIZE = False

# Load the survey export into `responces`: one key per header cell, mapped to
# the list of non-empty answers found in that column.  Answers are trimmed and
# lower-cased before they are stored.
#
# newline='' is what the csv module documents for files handed to csv.reader;
# it leaves line-ending handling (including newlines embedded in quoted
# fields) to the reader itself.
with open('responces.csv', newline='') as csvfile:
    responce_reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    headers = next(responce_reader)
    responces = {h: [] for h in headers}
    for row in responce_reader:
        # zip() stops at the shorter sequence, so a blank or truncated row
        # contributes what it has instead of raising IndexError.
        for header, item in zip(headers, row):
            item = item.strip().lower()
            if item:
                responces[header].append(item)
# Group the answers to each question by fuzzy string similarity and record the
# size of every group.
#
# collated_answers maps question -> [(answer, cluster_size), ...] sorted by
# cluster_size, largest first.  `answer` is one member of the cluster, used as
# its label.
collated_answers = {}
for question, answers in responces.items():
    # ratio_matrix[i, j] holds the fuzz.ratio score of answers i and j for the
    # pairs that were actually compared; untouched cells stay 0.
    ratio_matrix = np.zeros((len(answers), len(answers)))
    for i, ans_a in enumerate(answers):
        for j, ans_b in enumerate(answers):
            ratio = fuzz.ratio(ans_a, ans_b)
            ratio_matrix[i, j] = ratio
            if ratio > 60:
                # Stop scanning at the first answer that resembles ans_a.
                # NOTE(review): the nesting level of this `break` was
                # reconstructed (it is taken to belong to this `if`) --
                # confirm against the intended behaviour.  As written, an
                # answer whose first >60 match scores <=70 gets no edge below
                # even if a later answer would have scored higher.
                break

    # Answers are linked when their recorded score exceeds 70; each connected
    # component of the resulting undirected graph is one cluster.
    graph_matrix = (ratio_matrix > 70) * 1
    G = nx.to_networkx_graph(graph_matrix, create_using=nx.Graph)

    clusters = {}
    for comp in connected_components(G):
        # Label the cluster with the first member the set yields; clusters
        # whose labels are the same text overwrite one another.
        representative = answers[next(iter(comp))]
        clusters[representative] = len(comp)

    clusters = sorted(clusters.items(), key=lambda pair: pair[1], reverse=True)
    collated_answers[question] = clusters

    print(question)
    print('\n' + ''.join(f"{k}\t\t{v}\n" for k, v in clusters))
# Write the collated clusters to collated_answers.csv.  Each question
# produces a '*****' separator row, a row with the question text, and then
# one "answer,count" row per cluster in the order stored in collated_answers.
with open('collated_answers.csv', 'w+', newline='\n') as out_file:
    out = csv.writer(out_file, delimiter=',')
    for heading in collated_answers:
        out.writerows([['*****'], [heading]])
        out.writerows(
            [label, total] for label, total in collated_answers[heading]
        )
# answers_writer = csv.DictWriter(csvfile,fieldnames=collated_answers.keys())
# answers_writer.writeheader()
# for q,a in collated_answers:
#keys = final_tally.keys()
#final_tally = [ (k,v) for k,v in final_tally.items()]
##final_tally.sort(key = lambda x: x[1])
#final_tally.sort(key = lambda x: x[1],reverse=True)
##final_tally = final_tally[:8]
##distinct_ratios = ratio_matrix - np.identity(len(answers))*100
#if RANDOMIZE:
# counts = np.array([x[1] + random.randint(0,5) for x in final_tally])
#else:
# counts = np.array([x[1] + random.randint(0,4) for x in final_tally])
##counts = np.array([x[1] for x in final_tally])
#total = sum(counts)
#percent = [c * 100 / total for c in counts]
#
#sums = sum(ratio_matrix > 60)
##answer_scores = list(zip(keys,percent))
##answer_scores.sort(key = lambda x: x[1],reverse=True)
##print(answer_scores)
#final_counts = [ a for a in zip(answers,sums) if a[1]]
#final_counts.sort(key=lambda x: x[1],reverse=True)
#final_counts = final_counts[:12]
#if RANDOMIZE:
# final_counts = [(x[0],x[1] + random.randint(0,5)) for x in final_counts]
#else:
# final_counts = [(x[0],x[1]) for x in final_counts]
#
#total = sum([x[1] for x in final_counts])
#
#final_counts = [(x[0],int(x[1]*100/total)) for x in final_counts]
#
#final_counts = dict(final_counts)
#
#print(final_counts)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment