almugabo · March 7, 2019 13:47
diff --git a/genMetaPaths.py b/genMetaPaths.py
 #!/usr/bin/env python
 # coding: utf-8
 # March 7th, 2019 (by mm)
 ##  Generating paths (walks)
 ## inspired by the original py4genMetaPaths.py from 
 ## but also : by stellargraph 
 ## TO DO : create tests 

 import pandas as pd 
 import random 
 import time 


 ### IMPORTANT : DATA FORMAT 
 ### Graph Data are saved in a single table of edges 
 ## the mandatory fields are : 
 ## node_source
 ## node_target 
 ## edge_path : specifies the node types of the source / target, in the format 'node_type'_'node_type' (example : author_paper)
 ## edge_type : a field which can help to distinguish between edges with the same node types but a different relation
 # AND node_ids have to be unique per node 
 ## ----------------------------------------

 def create_node_dictionary(xDF):
     '''
     Build a lookup of node type and neighbours from a table of edges.

     Every row of xDF is read as one undirected edge: the target is stored
     as a neighbour of the source and the source as a neighbour of the
     target. Two nodes connected by several rows therefore appear several
     times in each other's neighbour list.

     Input:
     xDF : pandas DataFrame with the columns 'node_source', 'node_target'
           and 'edge_path'. 'edge_path' is split on '_'; the first piece is
           used as the type of the source node and the second piece as the
           type of the target node (example : author_paper).
     Output:
     a dictionary of the format
     {node_id: {'node_type': xxx, 'neighbours': [node_id, ...]}}
     Keys are inserted in order of first appearance (all sources in row
     order, then all targets in row order); if an id shows up with more
     than one type, the first type seen is kept.

     ## TO DO LATER:
     - one can stream it from a database or a text file
     - to reduce memory footprint, it can operate with integer indices of
       ids, both for the keys and the neighbours
       https://stackoverflow.com/questions/10264874
     - generally, the heavy lifting (creation of the dictionary of nodes,
       integer indices rather than strings) can be done in a database
       back end
     '''
     # materialise the columns once; every pass below reuses these lists
     xlst_sources = list(xDF['node_source'])
     xlst_targets = list(xDF['node_target'])
     xlst_type_pairs = list(xDF['edge_path'].str.split('_'))
     xlst_source_types = [xpair[0] for xpair in xlst_type_pairs]
     xlst_target_types = [xpair[1] for xpair in xlst_type_pairs]

     # Register the nodes in row order instead of going through a set of
     # (id, type) tuples: string hashing is randomised per interpreter run,
     # so a set would hand out the nodes in an order that can differ from
     # run to run, and anything iterating the dictionary would inherit that.
     xdict_nodes = {}
     for xlst_ids, xlst_types in ((xlst_sources, xlst_source_types),
                                  (xlst_targets, xlst_target_types)):
         for xnode_id, xnode_type in zip(xlst_ids, xlst_types):
             if xnode_id not in xdict_nodes:
                 xdict_nodes[xnode_id] = {'node_type': xnode_type,
                                          'neighbours': []}

     # Two separate passes so that each neighbour list holds first the
     # targets reached from the node, then the sources pointing at it.
     for xsource, xtarget in zip(xlst_sources, xlst_targets):
         xdict_nodes[xsource]['neighbours'].append(xtarget)
     for xsource, xtarget in zip(xlst_sources, xlst_targets):
         xdict_nodes[xtarget]['neighbours'].append(xsource)

     return xdict_nodes
    

 def create_walks(xdict_nodes,
                  xmetapaths,
                  xwalk_length,
                  xwalk_number,
                  xoutfile_name,
                  xrandom_seed = None):
     '''
     Create random walks that follow the given metapaths and save them in
     a text file, one walk per line with the node ids separated by a space.

     For every node, and for every metapath whose first node type equals
     the type of that node, xwalk_number walks are started at the node.
     At each step the next node is picked at random from the entries of
     the current node's neighbour list that have the node type required
     by the metapath; the types after the first one are cycled for as
     long as needed. A walk stops early when no neighbour has the
     required type.

     Input:
     xdict_nodes : a dictionary with node data in the format
                   {node_id: {'node_type': xxx, 'neighbours': [...]}}
     xmetapaths : in format [['paper','author'], ['author', 'paper']]
                  (metapaths with fewer than two node types describe no
                  step and are skipped)
     xwalk_length : maximum number of nodes in one walk
     xwalk_number : number of walks per start node and matching metapath
     xoutfile_name : path of the output file (an existing file is
                     overwritten)
     xrandom_seed : seed of the random number generator; None gives an
                    unseeded generator
     Output:
     a text file with the walks (nothing is returned)
     '''
     # random.Random(None) seeds itself from the system, so handing the
     # argument straight through also honours a seed of 0
     x_rs = random.Random(xrandom_seed)

     # 'w' truncates an existing file, no separate open/close is needed
     with open(xoutfile_name, 'w') as ff:
         for xnode, xnode_data in xdict_nodes.items():
             xnode_type = xnode_data['node_type']
             for xmetapath in xmetapaths:
                 # retain only metapaths which begin with this node type;
                 # one with a single type leaves no step types to cycle
                 if len(xmetapath) < 2 or xmetapath[0] != xnode_type:
                     continue
                 # node types required for the 2nd, 3rd, ... node of a walk
                 xstep_types = xmetapath[1:]

                 for _ in range(xwalk_number):
                     xwalk_for_node = []
                     xcurrent_node = xnode
                     for xnr in range(xwalk_length):
                         xwalk_for_node.append(xcurrent_node)
                         if len(xwalk_for_node) == xwalk_length:
                             # walk is complete, no successor is needed
                             break
                         xwanted_type = xstep_types[xnr % len(xstep_types)]
                         # neighbours of the CURRENT node (not of the start
                         # node) with the type the metapath asks for next
                         xlst_neighbours = [
                             xneighbour
                             for xneighbour in xdict_nodes[xcurrent_node]['neighbours']
                             if xdict_nodes[xneighbour]['node_type'] == xwanted_type
                         ]
                         if not xlst_neighbours:
                             # dead end for this metapath: keep what we have
                             break
                         xcurrent_node = x_rs.choice(xlst_neighbours)
                     if xwalk_for_node:
                         # str() so that non-string node ids can be written
                         ff.write(' '.join(map(str, xwalk_for_node)) + '\n')
     print('all walks written')
	#!/usr/bin/env python
	# coding: utf-8
	# March 7th, 2019 (by mm)
	## Generating paths (walks)
	## inspired by the original py4genMetaPaths.py from
	## but also : by stellargraph
	## TO DO : create tests

	import pandas as pd
	import random
	import time


	### IMPORTANT : DATA FORMAT
	### Graph Data are saved in a single table of edges
	## the mandatory fields are :
	## node_source
	## node_target
	## edge_path : specifies the node types of the source / target, in the format 'node_type'_'node_type' (example : author_paper)
	## edge_type : a field which can help to distinguish between edges with the same node types but a different relation
	# AND node_ids have to be unique per node
	## ----------------------------------------

	def create_node_dictionary(xDF):
		'''
		Build a lookup of node type and neighbours from a table of edges.

		Every row of xDF is read as one undirected edge: the target is stored
		as a neighbour of the source and the source as a neighbour of the
		target. Two nodes connected by several rows therefore appear several
		times in each other's neighbour list.

		Input:
		xDF : pandas DataFrame with the columns 'node_source', 'node_target'
		      and 'edge_path'. 'edge_path' is split on '_'; the first piece is
		      used as the type of the source node and the second piece as the
		      type of the target node (example : author_paper).
		Output:
		a dictionary of the format
		{node_id: {'node_type': xxx, 'neighbours': [node_id, ...]}}
		Keys are inserted in order of first appearance (all sources in row
		order, then all targets in row order); if an id shows up with more
		than one type, the first type seen is kept.

		## TO DO LATER:
		- one can stream it from a database or a text file
		- to reduce memory footprint, it can operate with integer indices of
		  ids, both for the keys and the neighbours
		  https://stackoverflow.com/questions/10264874
		- generally, the heavy lifting (creation of the dictionary of nodes,
		  integer indices rather than strings) can be done in a database
		  back end
		'''
		# materialise the columns once; every pass below reuses these lists
		xlst_sources = list(xDF['node_source'])
		xlst_targets = list(xDF['node_target'])
		xlst_type_pairs = list(xDF['edge_path'].str.split('_'))
		xlst_source_types = [xpair[0] for xpair in xlst_type_pairs]
		xlst_target_types = [xpair[1] for xpair in xlst_type_pairs]

		# Register the nodes in row order instead of going through a set of
		# (id, type) tuples: string hashing is randomised per interpreter run,
		# so a set would hand out the nodes in an order that can differ from
		# run to run, and anything iterating the dictionary would inherit that.
		xdict_nodes = {}
		for xlst_ids, xlst_types in ((xlst_sources, xlst_source_types),
				(xlst_targets, xlst_target_types)):
			for xnode_id, xnode_type in zip(xlst_ids, xlst_types):
				if xnode_id not in xdict_nodes:
					xdict_nodes[xnode_id] = {'node_type': xnode_type,
						'neighbours': []}

		# Two separate passes so that each neighbour list holds first the
		# targets reached from the node, then the sources pointing at it.
		for xsource, xtarget in zip(xlst_sources, xlst_targets):
			xdict_nodes[xsource]['neighbours'].append(xtarget)
		for xsource, xtarget in zip(xlst_sources, xlst_targets):
			xdict_nodes[xtarget]['neighbours'].append(xsource)

		return xdict_nodes


	def create_walks(xdict_nodes,
			xmetapaths,
			xwalk_length,
			xwalk_number,
			xoutfile_name,
			xrandom_seed = None):
		'''
		Create random walks that follow the given metapaths and save them in
		a text file, one walk per line with the node ids separated by a space.

		For every node, and for every metapath whose first node type equals
		the type of that node, xwalk_number walks are started at the node.
		At each step the next node is picked at random from the entries of
		the current node's neighbour list that have the node type required
		by the metapath; the types after the first one are cycled for as
		long as needed. A walk stops early when no neighbour has the
		required type.

		Input:
		xdict_nodes : a dictionary with node data in the format
		              {node_id: {'node_type': xxx, 'neighbours': [...]}}
		xmetapaths : in format [['paper','author'], ['author', 'paper']]
		             (metapaths with fewer than two node types describe no
		             step and are skipped)
		xwalk_length : maximum number of nodes in one walk
		xwalk_number : number of walks per start node and matching metapath
		xoutfile_name : path of the output file (an existing file is
		                overwritten)
		xrandom_seed : seed of the random number generator; None gives an
		               unseeded generator
		Output:
		a text file with the walks (nothing is returned)
		'''
		# random.Random(None) seeds itself from the system, so handing the
		# argument straight through also honours a seed of 0
		x_rs = random.Random(xrandom_seed)

		# 'w' truncates an existing file, no separate open/close is needed
		with open(xoutfile_name, 'w') as ff:
			for xnode, xnode_data in xdict_nodes.items():
				xnode_type = xnode_data['node_type']
				for xmetapath in xmetapaths:
					# retain only metapaths which begin with this node type;
					# one with a single type leaves no step types to cycle
					if len(xmetapath) < 2 or xmetapath[0] != xnode_type:
						continue
					# node types required for the 2nd, 3rd, ... node of a walk
					xstep_types = xmetapath[1:]

					for _ in range(xwalk_number):
						xwalk_for_node = []
						xcurrent_node = xnode
						for xnr in range(xwalk_length):
							xwalk_for_node.append(xcurrent_node)
							if len(xwalk_for_node) == xwalk_length:
								# walk is complete, no successor is needed
								break
							xwanted_type = xstep_types[xnr % len(xstep_types)]
							# neighbours of the CURRENT node (not of the start
							# node) with the type the metapath asks for next
							xlst_neighbours = [
								xneighbour
								for xneighbour in xdict_nodes[xcurrent_node]['neighbours']
								if xdict_nodes[xneighbour]['node_type'] == xwanted_type
							]
							if not xlst_neighbours:
								# dead end for this metapath: keep what we have
								break
							xcurrent_node = x_rs.choice(xlst_neighbours)
						if xwalk_for_node:
							# str() so that non-string node ids can be written
							ff.write(' '.join(map(str, xwalk_for_node)) + '\n')
		print('all walks written')