Skip to content

Instantly share code, notes, and snippets.

@almugabo
Created March 7, 2019 13:47
Show Gist options
  • Save almugabo/44d9e3d9d9ff402408d8924dbd832270 to your computer and use it in GitHub Desktop.
Save almugabo/44d9e3d9d9ff402408d8924dbd832270 to your computer and use it in GitHub Desktop.
Metapaths2vec utilities
#!/usr/bin/env python
# coding: utf-8
# March 7th, 2019 (by mm)
## Generating paths (walks)
## inspired by the original py4genMetaPaths.py from
## but also : by stellargraph
## TO DO : create tests
import pandas as pd
import random
import time
### IMPORTANT : DATA FORMAT
### Graph Data are saved in a single table of edges
## the mandatory fields are :
## node_source
## node_target
## edge_path (this specifies the node types of the source / target and is of format : 'node_type'_'node_type' example : author_paper)
## edge_type : this is a field which can help distinguish between edges with the same node types but a different relation
## edge_type
# AND node_ids have to be unique per node
## ----------------------------------------
def create_node_dictionary(xDF):
    """Build a dictionary describing every node found in an edge table.

    The result has the following format::

        {node_id: {'node_type': <str>, 'neighbours': [<node_id>, ...]}}

    Input:
        xDF : a dataframe of edges with (at least) the columns
              ``node_source``, ``node_target`` and ``edge_path``.
              ``edge_path`` is split on ``'_'``; the first part is used as
              the type of the source node, the second part as the type of
              the target node.

    Output:
        a dictionary with node information. Nodes are inserted in order of
        first appearance (all sources in row order, then all targets in row
        order), so the iteration order of the result is the same on every
        run. If the same node id appears with more than one type, the type
        seen first in that order is kept.

        Edges are treated as undirected: for each node, ``neighbours`` first
        lists the targets of the rows where it is the source, then the
        sources of the rows where it is the target (both in row order).
        Parallel edges therefore produce repeated neighbours.

    TO DO LATER:
        - one can stream it from a database or a text file
        - to reduce memory footprint, it can operate with an (integer) index
          of ids, both for the keys and the neighbours
          https://stackoverflow.com/questions/10264874
        - generally, the heavy lifting (creation of the node dictionary,
          using int indices rather than strings) can be done in a database
          back end
    """
    sources = xDF['node_source'].tolist()
    targets = xDF['node_target'].tolist()
    # one [source_type, target_type, ...] list per edge
    type_pairs = xDF['edge_path'].str.split('_').tolist()

    xdict_nodes = {}

    # Register nodes in a deterministic order. Going through a set of
    # (id, type) tuples would make the dict order depend on string hashing,
    # which changes from one interpreter run to the next.
    for node_id, type_pair in zip(sources, type_pairs):
        if node_id not in xdict_nodes:
            xdict_nodes[node_id] = {'node_type': type_pair[0], 'neighbours': []}
    for node_id, type_pair in zip(targets, type_pairs):
        if node_id not in xdict_nodes:
            xdict_nodes[node_id] = {'node_type': type_pair[1], 'neighbours': []}

    # Add neighbours: first the outgoing direction for every edge ...
    for source_id, target_id in zip(sources, targets):
        xdict_nodes[source_id]['neighbours'].append(target_id)
    # ... then the incoming direction, so the graph is walked as undirected.
    for source_id, target_id in zip(sources, targets):
        xdict_nodes[target_id]['neighbours'].append(source_id)

    return xdict_nodes
def create_walks(xdict_nodes,
                 xmetapaths,
                 xwalk_length,
                 xwalk_number,
                 xoutfile_name,
                 xrandom_seed=None):
    """Create metapath-guided random walks and write them to a text file.

    For every node, and for every metapath whose first node type equals the
    type of that node, ``xwalk_number`` walks are started from the node.
    Each walk holds at most ``xwalk_length`` nodes. At every step the next
    node is drawn uniformly from the neighbours of the *current* node that
    have the type required by the metapath; the walk stops early when no
    such neighbour exists.

    Input:
        xdict_nodes : a dictionary with node data in the format
                      {node_id: {'node_type': ..., 'neighbours': [...]}}
        xmetapaths : a list of metapaths, each a list of node types, e.g.
                     [['author', 'paper', 'author']]. The types after the
                     first one are repeated cyclically to cover the walk
                     length. Each metapath needs at least two node types;
                     a single-type metapath that matches a node raises
                     ZeroDivisionError.
        xwalk_length : maximum number of nodes in one walk
        xwalk_number : number of walks per (start node, metapath) pair
        xoutfile_name : path of the output file (overwritten if it exists)
        xrandom_seed : seed for the random generator; None (the default)
                       seeds from the system, any other value (including 0)
                       is used as the seed

    Output:
        a text file with one walk per line, node ids separated by a single
        space. Nothing is returned.
    """
    # random.Random(None) seeds itself from the system, so the seed can be
    # passed straight through; this also honours a seed of 0.
    x_rs = random.Random(xrandom_seed)

    # 'w' truncates an existing file, so one open is enough
    with open(xoutfile_name, 'w') as ff:
        for xnode, xnode_info in xdict_nodes.items():
            xnode_type = xnode_info['node_type']

            for xmetapath in xmetapaths:
                # retain only metapaths which begin with this node's type
                if xmetapath[0] != xnode_type:
                    continue

                # node types required at step 0, 1, 2, ... of the walk:
                # the metapath without its first type, repeated often
                # enough to cover xwalk_length steps
                xstep_types = xmetapath[1:] * ((xwalk_length // (len(xmetapath) - 1)) + 1)

                for _ in range(xwalk_number):
                    xwalk_for_node = []
                    xcurrent_node = xnode

                    for xnr in range(xwalk_length):
                        xwalk_for_node.append(xcurrent_node)

                        # neighbours of the node we are standing on (not of
                        # the start node) that have the required type
                        xcandidates = [
                            xneighbour
                            for xneighbour in xdict_nodes[xcurrent_node]['neighbours']
                            if xdict_nodes[xneighbour]['node_type'] == xstep_types[xnr]
                        ]
                        if not xcandidates:
                            # no neighbour of the type required by the
                            # metapath: the walk ends here
                            break

                        xcurrent_node = x_rs.choice(xcandidates)

                    # write the walk; str() allows non-string node ids
                    if xwalk_for_node:
                        ff.write(' '.join(map(str, xwalk_for_node)) + '\n')

    print('all walks written')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment