Created
March 27, 2016 12:56
-
-
Save chribsen/99a4e1dadbb73ebd5631 to your computer and use it in GitHub Desktop.
Computes, for every user pair, the Jaccard similarity between the sets of places visited by the two users.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import psycopg2
from scipy.spatial.distance import jaccard
# Placeholder: replace with the real libpq connection string before running.
CONNECTION_STRING = "<connstring>"

# Number of processed user pairs between intermediate commits.
COMMIT_INTERVAL = 500

# Retrieve the places visited by each user of a pair, aggregated into arrays.
# Pairs in which either user has no recorded places are skipped.
USER_PAIR_PLACES_QUERY = """
    SELECT user_a, places_a, user_b, places_b
    FROM (
        SELECT dff.user_a,
               (SELECT array_agg(place_id) FROM derived_places_visited
                 WHERE user_id = dff.user_a) AS places_a,
               dff.user_b,
               (SELECT array_agg(place_id) FROM derived_places_visited
                 WHERE user_id = dff.user_b) AS places_b
        FROM derived_friend_features AS dff
    ) AS d
    WHERE places_a IS NOT NULL
      AND places_b IS NOT NULL
"""


def place_set_similarity(places_a, places_b):
    """Return the Jaccard similarity of two place lists and their shared places.

    The similarity is ``|A & B| / |A | B|`` over the *sets* of place IDs.
    It is computed with set arithmetic because
    ``scipy.spatial.distance.jaccard`` compares equal-length vectors
    position by position and therefore does not apply to variable-length
    lists of IDs.

    Args:
        places_a: iterable of hashable place IDs visited by the first user.
        places_b: iterable of hashable place IDs visited by the second user.

    Returns:
        A ``(similarity, shared_places)`` tuple. ``similarity`` is a float in
        ``[0.0, 1.0]``; ``shared_places`` lists each common place once, in the
        order it first appears in ``places_a``. When both inputs are empty
        the similarity is defined as ``0.0``.
    """
    set_a = set(places_a)
    set_b = set(places_b)
    union_size = len(set_a | set_b)
    if union_size == 0:
        # No places at all: avoid dividing by zero and report no overlap.
        return 0.0, []

    shared_places = []
    already_added = set()
    for place in places_a:
        if place in set_b and place not in already_added:
            already_added.add(place)
            shared_places.append(place)

    # float() keeps true division even under Python 2.
    return float(len(shared_places)) / union_size, shared_places


def main():
    """Store the place-based Jaccard similarity for every user pair.

    For each row returned by ``USER_PAIR_PLACES_QUERY`` this updates
    ``derived_friend_features.places_jac_similarity`` and inserts the shared
    place IDs into ``tmp_places_visited_together``, committing every
    ``COMMIT_INTERVAL`` pairs and once more at the end.
    """
    conn_dtu = psycopg2.connect(CONNECTION_STRING)
    try:
        cur_dtu = conn_dtu.cursor()
        try:
            cur_dtu.execute(USER_PAIR_PLACES_QUERY)
            # fetchall() materialises the result set, so the same cursor can
            # be reused for the writes below.
            user_pairs = cur_dtu.fetchall()

            for processed, (user_a, places_a, user_b, places_b) in enumerate(user_pairs, start=1):
                jac_similarity, places_visited_together = place_set_similarity(
                    places_a, places_b)

                # Update the feature.
                cur_dtu.execute(
                    """UPDATE derived_friend_features SET places_jac_similarity=%s WHERE user_a=%s AND user_b=%s""",
                    (jac_similarity, user_a, user_b))

                # Insert the places visited together: just in case we need
                # this later.
                cur_dtu.execute(
                    """INSERT INTO tmp_places_visited_together (user_a, user_b, place_ids) VALUES (%s,%s,%s)""",
                    (user_a, user_b, places_visited_together))

                # Counting from 1 avoids a pointless commit after the very
                # first pair.
                if processed % COMMIT_INTERVAL == 0:
                    print('Saving...')
                    conn_dtu.commit()

            # Persist whatever is left since the last intermediate commit.
            conn_dtu.commit()
        finally:
            cur_dtu.close()
    finally:
        # Always release the connection, even if a statement fails.
        conn_dtu.close()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment