Created
March 7, 2019 13:47
-
-
Save almugabo/44d9e3d9d9ff402408d8924dbd832270 to your computer and use it in GitHub Desktop.
Metapaths2vec utilities
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# March 7th, 2019 (by mm) | |
## Generating paths (walks) | |
## inspired by the original py4genMetaPaths.py from | |
## but also : by stellargraph | |
## TO DO : create tests | |
import pandas as pd | |
import random | |
import time | |
### IMPORTANT : DATA FORMAT | |
### Graph Data are saved in a single table of edges | |
## the mandatory fields are : | |
## node_source | |
## node_target | |
## edge_path : specifies the node types of the source / target and has the format 'node_type'_'node_type' (example: author_paper)
## edge_type : a field which helps to distinguish edges with the same node types but a different relation
## edge_type | |
# AND node_ids have to be unique per node | |
## ---------------------------------------- | |
def create_node_dictionary(xDF):
    """Build a per-node lookup table from a dataframe of edges.

    The result has the format::

        {node_id: {'node_type': <str>, 'neighbours': [<node_id>, ...]}}

    Input:
        xDF : dataframe of edges with the columns 'node_source',
              'node_target' and 'edge_path'. 'edge_path' is split on '_';
              the first part is used as the type of the source node and
              the second part as the type of the target node
              (example: author_paper).

    Output:
        a dictionary with node information. Keys are inserted in order of
        first appearance in the edge table (source before target within a
        row), so the iteration order is the same on every run. If a node id
        shows up with more than one node type, the first type seen is kept.

        Edges are treated as undirected: for every row the target is added
        to the neighbours of the source and the source to the neighbours of
        the target. A node's neighbour list holds first the targets of the
        rows where it is the source, then the sources of the rows where it
        is the target, each in row order. Repeated edges give repeated
        neighbour entries.

    ## TO DO LATER:
        - one can stream it from a database or a text file
        - to reduce memory footprint, it can operate with index (integer)
          of ids, both for the keys and the neighbours
          https://stackoverflow.com/questions/10264874
        - GENERALLY: one can have the lifting done in a database back end,
          e.g. creation of the dictionary of nodes, using indices (int)
          rather than strings
    """
    xlst_source = list(xDF['node_source'])
    xlst_target = list(xDF['node_target'])
    # each entry is a list like ['author', 'paper']
    xlst_node_type = list(xDF['edge_path'].str.split('_'))

    xdict_nodes = {}

    # register the nodes in order of first appearance; going through a set
    # here would make the key order depend on string hash randomisation and
    # therefore change from one interpreter run to the next
    for xsource, xtarget, xtypes in zip(xlst_source, xlst_target, xlst_node_type):
        for xnode_id, xnode_type in ((xsource, xtypes[0]), (xtarget, xtypes[1])):
            if xnode_id not in xdict_nodes:
                xdict_nodes[xnode_id] = {'node_type': xnode_type,
                                         'neighbours': []}

    ### add neighbours
    # first the outgoing direction (targets of each source) ...
    for xsource, xtarget in zip(xlst_source, xlst_target):
        xdict_nodes[xsource]['neighbours'].append(xtarget)
    # ... then the same for node_target (sources of each target)
    for xsource, xtarget in zip(xlst_source, xlst_target):
        xdict_nodes[xtarget]['neighbours'].append(xsource)

    return xdict_nodes
def create_walks(xdict_nodes,
                 xmetapaths,
                 xwalk_length,
                 xwalk_number,
                 xoutfile_name,
                 xrandom_seed=None):
    """Create random walks that follow the given metapaths and save them.

    For every node and every metapath whose first node type equals the type
    of that node, xwalk_number walks are started at the node. At each step
    the walk moves to a randomly chosen neighbour of the *current* node
    whose type is the next one in the metapath; the metapath (without its
    first element) is repeated as often as needed to cover xwalk_length. A
    walk ends early when the current node has no neighbour of the required
    type.

    Input:
        xdict_nodes   : a dictionary with node data in the format
                        {node_id: {'node_type': ..., 'neighbours': [...]}}
        xmetapaths    : list of metapaths, each a sequence of at least two
                        node types, e.g. [['author', 'paper', 'author']]
        xwalk_length  : maximum number of nodes in one walk
        xwalk_number  : number of walks per node and matching metapath
        xoutfile_name : name of the text file to write (overwritten)
        xrandom_seed  : seed for the random generator; None (default) gives
                        an unseeded generator, any other value (0 included)
                        is used as the seed

    Output:
        a text file with one walk per line, the node ids separated by a
        single space. Nothing is returned.

    Raises:
        ValueError : if a metapath has fewer than two node types
    """
    # a metapath needs a start type and at least one type to step to;
    # checking up front avoids an obscure ZeroDivisionError later on
    for xmetapath in xmetapaths:
        if len(xmetapath) < 2:
            raise ValueError(
                'each metapath needs at least two node types, got: %r'
                % (xmetapath,))

    # random.Random(None) seeds itself from the system, so the seed can be
    # passed straight through; a truthiness test would silently ignore 0
    x_rs = random.Random(xrandom_seed)

    # for each start type, the sequence of node types to step through,
    # repeated so that it is at least xwalk_length long; this does not
    # depend on the node, so it is computed once
    xdict_step_types = {}
    for xmetapath in xmetapaths:
        xsteps = xmetapath[1:]
        xsteps = xsteps * ((xwalk_length // len(xsteps)) + 1)
        xdict_step_types.setdefault(xmetapath[0], []).append(xsteps)

    # do the walks and save the walks in a file ('w' truncates an old file)
    with open(xoutfile_name, 'w') as ff:
        for xnode, xnode_info in xdict_nodes.items():
            # only metapaths that begin with the type of this node
            for xsteps in xdict_step_types.get(xnode_info['node_type'], ()):
                for _ in range(xwalk_number):
                    xwalk_for_node = []
                    xcurrent_node = xnode
                    for xnr in range(xwalk_length):
                        # add the current node
                        xwalk_for_node.append(xcurrent_node)
                        # neighbours of the node the walk is currently at
                        # (not of the start node), restricted to the type
                        # required by the metapath at this step
                        xwanted_type = xsteps[xnr]
                        xlst_neighbours = [
                            xneighbour
                            for xneighbour
                            in xdict_nodes[xcurrent_node]['neighbours']
                            if xdict_nodes[xneighbour]['node_type'] == xwanted_type
                        ]
                        if not xlst_neighbours:
                            # no neighbour of the required type: stop here
                            break
                        # a randomly chosen neighbour is next in the walk
                        xcurrent_node = x_rs.choice(xlst_neighbours)
                    # write the walk; str() so that non-string ids work too
                    if xwalk_for_node:
                        ff.write(' '.join(map(str, xwalk_for_node)) + '\n')
    print('all walks written')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment