Last active
December 10, 2023 01:11
-
-
Save shawngraham/8b099ae4992a57718cf66e2c3166a16d to your computer and use it in GitHub Desktop.
Comparing the ICIJ Offshore Leaks database (https://offshoreleaks.icij.org/pages/database) against my knowledge-graph.csv to augment the knowledge graph. Blog post: https://carleton.ca/xlab/2023/trawling-data-the-panama-papers-and-our-knowledge-graph/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import csv


def main(argv=None):
    """Exact-match entity names between two CSV files and append hits to an output CSV.

    Builds a set of entity names from the "source" and "target" columns of the
    first CSV, then appends to the output CSV every row of the second CSV whose
    "name" column is an exact member of that set.

    Args:
        argv: Optional argument list for argparse; defaults to sys.argv[1:]
              so command-line behavior is unchanged.
    """
    parser = argparse.ArgumentParser(
        description='Match entities from two CSV files and save the results to a new file.')
    parser.add_argument('first_csv', help='The CSV file with "source" and "target" columns.')
    parser.add_argument('second_csv', help='The CSV file to compare against.')
    parser.add_argument('--output_csv', default='matched_rows.csv',
                        help='The output CSV file with matched rows. Defaults to "matched_rows.csv".')
    args = parser.parse_args(argv)

    # Load source and target entities from the first CSV.
    # utf-8-sig strips a leading BOM so the first header key is clean.
    entities_set = set()
    with open(args.first_csv, mode='r', newline='', encoding='utf-8-sig') as file:
        csv_reader = csv.DictReader(file)
        for row in csv_reader:
            entities_set.add(row['source'].strip())
            entities_set.add(row['target'].strip())

    # Read the second CSV and append matches to the output CSV.
    match_count = 0
    with open(args.second_csv, mode='r', newline='', encoding='utf-8-sig') as read_file, \
            open(args.output_csv, mode='a', newline='', encoding='utf-8') as write_file:
        csv_reader = csv.DictReader(read_file)

        # Write the header only when the output file is still empty, so repeated
        # runs in append mode do not duplicate it.
        write_file.seek(0, 2)  # Go to the end of the file.
        write_header = write_file.tell() == 0

        csv_writer = csv.DictWriter(write_file, fieldnames=csv_reader.fieldnames)
        if write_header:
            csv_writer.writeheader()

        for row in csv_reader:
            # BUG FIX: the original wrote
            #   any(row['name'].strip() in entities_set for field in row)
            # which re-evaluated the identical condition once per column; the
            # intended check is a single membership test on the 'name' value.
            if row['name'].strip() in entities_set:
                csv_writer.writerow(row)
                match_count += 1

    if match_count == 0:
        print("No matches were found.")
    else:
        print(f"{match_count} matching {'row' if match_count == 1 else 'rows'} have been appended to the output CSV file.")


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fuzzy-match entity names from the first CSV ("source"/"target" columns)
# against the "name" column of the second CSV using rapidfuzz, appending every
# row that scores at or above the threshold to the output CSV together with
# the best-matching entity name in a new "matched_name" column.
import argparse
import csv
from rapidfuzz import process, fuzz

# Initialize the argument parser
parser = argparse.ArgumentParser(description='Match entities from two CSV files and save the results to a new file.')
parser.add_argument('first_csv', help='The CSV file with "source" and "target" columns.')
parser.add_argument('second_csv', help='The CSV file to compare against.')
parser.add_argument('--output_csv', default='matched_rows.csv', help='The output CSV file with matched rows.')
parser.add_argument('--score_threshold', type=int, default=90, help='The match score threshold for fuzzy matching.')

# Parse the command-line arguments
args = parser.parse_args()

# Load source and target entities from the first CSV.
# utf-8-sig strips a leading BOM so the first header key is clean.
entities_set = set()
with open(args.first_csv, mode='r', newline='', encoding='utf-8-sig') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        entities_set.add(row['source'].strip())
        entities_set.add(row['target'].strip())

# Read the second CSV and append matches to the output CSV (append mode so
# several runs against different node files accumulate into one output).
match_count = 0  # Counter for matches
with open(args.second_csv, mode='r', newline='', encoding='utf-8-sig') as read_file, \
        open(args.output_csv, mode='a', newline='', encoding='utf-8') as write_file:
    csv_reader = csv.DictReader(read_file)
    # Extend the fieldnames to include the matched_name field.
    # NOTE(review): if the output file already exists with a different header
    # (e.g. written without 'matched_name'), appended rows will not line up
    # with the existing header — confirm all producers use the same columns.
    fieldnames = csv_reader.fieldnames + ['matched_name']
    csv_writer = csv.DictWriter(write_file, fieldnames=fieldnames)
    # Write the header only when the output file is still empty, so repeated
    # runs in append mode do not duplicate it.
    write_file.seek(0, 2)  # Go to the end of file
    if write_file.tell() == 0:
        csv_writer.writeheader()
    # Check each row in the second CSV for fuzzy matches and append to the
    # output CSV if matched.
    for row in csv_reader:
        name_to_match = row['name'].strip()
        # extractOne returns (match, score, index) for the best candidate at or
        # above score_cutoff, or None when nothing clears the threshold.
        result = process.extractOne(name_to_match, entities_set, scorer=fuzz.WRatio, score_cutoff=args.score_threshold)
        # Check if a match was found and unpack the result
        if result:
            best_match, score, _ = result
            # Write the original row to the new file, appending the matched_name
            row['matched_name'] = best_match
            csv_writer.writerow(row)
            match_count += 1

if match_count == 0:
    print("No matches were found.")
else:
    print(f"{match_count} matching {'row' if match_count == 1 else 'rows'} have been appended to the output CSV file.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv


def main(matched_rows_path='matched_rows.csv',
         relationships_path='orig_data/relationships.csv',
         output_relationships_path='output_relationships.csv'):
    """Extract relationships touching any matched node.

    Reads the node_id column of *matched_rows_path*, then writes to
    *output_relationships_path* every row of *relationships_path* whose
    node_id_start or node_id_end is one of those IDs.

    The defaults reproduce the original hard-coded file paths, so running the
    script unchanged behaves exactly as before; callers may now supply other
    paths (e.g. for testing).
    """
    # Load the node_id values from the matched-rows file.
    node_ids_set = set()
    with open(matched_rows_path, mode='r', newline='', encoding='utf-8') as file:
        for row in csv.DictReader(file):
            node_ids_set.add(row['node_id'].strip())

    # Read the relationships file and write matches to a new CSV.
    match_count = 0  # Counter for matches
    with open(relationships_path, mode='r', newline='', encoding='utf-8') as read_file, \
            open(output_relationships_path, mode='w', newline='', encoding='utf-8') as write_file:
        csv_reader = csv.DictReader(read_file)
        csv_writer = csv.DictWriter(write_file, fieldnames=csv_reader.fieldnames)
        csv_writer.writeheader()
        # Keep a relationship if either endpoint is a matched node.
        for row in csv_reader:
            if row['node_id_start'].strip() in node_ids_set or row['node_id_end'].strip() in node_ids_set:
                csv_writer.writerow(row)
                match_count += 1

    if match_count == 0:
        print("No matching relationships were found.")
    else:
        print(f"{match_count} matching {'relationship' if match_count == 1 else 'relationships'} have been written to the output CSV file.")


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import csv


def main(argv=None):
    """Match node IDs from two CSV files and append hits to an output CSV.

    Builds a set of identifiers from the "source" and "target" columns of the
    first CSV, then appends to the output CSV every row of the second CSV
    whose "node_id" column is an exact member of that set.

    Args:
        argv: Optional argument list for argparse; defaults to sys.argv[1:]
              so command-line behavior is unchanged.
    """
    parser = argparse.ArgumentParser(
        description='Match entities from two CSV files and save the results to a new file.')
    parser.add_argument('first_csv', help='The CSV file with "source" and "target" columns.')
    parser.add_argument('second_csv', help='The CSV file to compare against.')
    parser.add_argument('--output_csv', default='matched_rows.csv',
                        help='The output CSV file with matched rows. Defaults to "matched_rows.csv".')
    args = parser.parse_args(argv)

    # Load source and target identifiers from the first CSV.
    # utf-8-sig strips a leading BOM so the first header key is clean.
    entities_set = set()
    with open(args.first_csv, mode='r', newline='', encoding='utf-8-sig') as file:
        csv_reader = csv.DictReader(file)
        for row in csv_reader:
            entities_set.add(row['source'].strip())
            entities_set.add(row['target'].strip())
    # (The original printed the entire entity set here — debug output removed.)

    # Read the second CSV and append matches to the output CSV.
    match_count = 0
    with open(args.second_csv, mode='r', newline='', encoding='utf-8-sig') as read_file, \
            open(args.output_csv, mode='a', newline='', encoding='utf-8') as write_file:
        csv_reader = csv.DictReader(read_file)

        # Write the header only when the output file is still empty, so repeated
        # runs in append mode do not duplicate it.
        write_file.seek(0, 2)  # Go to the end of the file.
        write_header = write_file.tell() == 0

        csv_writer = csv.DictWriter(write_file, fieldnames=csv_reader.fieldnames)
        if write_header:
            csv_writer.writeheader()

        for row in csv_reader:
            # BUG FIX: the original wrote
            #   any(row['node_id'].strip() in entities_set for field in row)
            # which re-evaluated the identical condition once per column; the
            # intended check is a single membership test on the 'node_id' value.
            if row['node_id'].strip() in entities_set:
                csv_writer.writerow(row)
                match_count += 1

    if match_count == 0:
        print("No matches were found.")
    else:
        print(f"{match_count} matching {'row' if match_count == 1 else 'rows'} have been appended to the output CSV file.")


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import csv


def main(argv=None):
    """Replace numeric node IDs in csv1's source/target columns with names.

    Builds a node_id -> name mapping from csv2, then rewrites csv1's "source"
    and "target" values through that mapping into the output CSV. IDs with no
    mapping are passed through unchanged.

    NOTE(review): only 'source' and 'target' are written — any other columns
    of csv1 (e.g. rel_type) are dropped from the output; confirm intended.

    Args:
        argv: Optional argument list for argparse; defaults to sys.argv[1:]
              so command-line behavior is unchanged.
    """
    parser = argparse.ArgumentParser(description='Replace node IDs with their corresponding names.')
    parser.add_argument('csv1', help='The CSV file with "source" and "target" columns, containing numeric node identifiers.')
    parser.add_argument('csv2', help='The CSV file with "node_id" and "name" columns, for mapping identifiers to names.')
    parser.add_argument('--output_csv', default='named_relationships.csv',
                        help='The output CSV file with "source" and "target" replaced by names.')
    args = parser.parse_args(argv)

    # Build the mapping from node_id to name from csv2.
    node_id_to_name = {}
    with open(args.csv2, mode='r', newline='', encoding='utf-8') as file:
        for row in csv.DictReader(file):
            node_id_to_name[row['node_id'].strip()] = row['name'].strip()

    # Read csv1, replace node IDs with names, and write the result.
    with open(args.csv1, mode='r', newline='', encoding='utf-8') as read_file, \
            open(args.output_csv, mode='w', newline='', encoding='utf-8') as write_file:
        csv_reader = csv.DictReader(read_file)
        csv_writer = csv.DictWriter(write_file, fieldnames=['source', 'target'])
        csv_writer.writeheader()
        for row in csv_reader:
            # Fall back to the raw ID when no name mapping exists.
            source_name = node_id_to_name.get(row['source'].strip(), row['source'])
            target_name = node_id_to_name.get(row['target'].strip(), row['target'])
            csv_writer.writerow({'source': source_name, 'target': target_name})

    print(f"Names have been replaced in {args.output_csv}.")


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow driver: runs the individual gist scripts in sequence to match the
# knowledge graph against the Offshore Leaks node files and produce a named
# relationships CSV.
import os  # NOTE(review): imported but unused in this script — candidate for removal.
import subprocess
import pandas as pd

# Define the paths to the original data.
knowledge_graph_path = 'orig_data/knowledge-graph.csv'
entities_path = 'orig_data/nodes-entities.csv'
intermediaries_path = 'orig_data/nodes-intermediaries.csv'
officers_path = 'orig_data/nodes-officers.csv'

# Step 1: Fuzzy-compare the knowledge graph against each node file.
# All three runs append into the same matched_rows.csv (the compare script
# opens its output in append mode).
# NOTE(review): subprocess.run return codes are not checked, so a failing
# step will not stop the workflow — consider check=True.
print("And awaaaay we go! \n\n...fuzzy compare of kg to entities\n")
subprocess.run(['python', '1_fuzzy_compare.py', knowledge_graph_path, entities_path, '--output_csv', 'matched_rows.csv', '--score_threshold', '95'])
print("... fuzzy compare of kg to intermediaries\n")
subprocess.run(['python', '1_fuzzy_compare.py', knowledge_graph_path, intermediaries_path, '--output_csv', 'matched_rows.csv', '--score_threshold', '95'])
print("... fuzzy compare of kg to officers\n")
subprocess.run(['python', '1_fuzzy_compare.py', knowledge_graph_path, officers_path, '--output_csv', 'matched_rows.csv', '--score_threshold', '95'])

# Step 2: Process the matched rows to generate relationships.
print("...digging out relationships...")
subprocess.run(['python', '2_relationshipper.py'])

# Keep only the first three columns of the relationships output and rename
# them; pandas replaces what used to be a manual column selection.
print("...sorting out relationships...")
df = pd.read_csv('output_relationships.csv', usecols=[0, 1, 2])
df.columns = ['source', 'target', 'rel_type']
df.to_csv('thirdstep.csv', index=False)

# Step 3: Compare node IDs against each node file, appending to laststep.csv.
print("...matching names to ids...")
subprocess.run(['python', '3_compare_node_ids.py', 'thirdstep.csv', entities_path, '--output_csv', 'laststep.csv'])
subprocess.run(['python', '3_compare_node_ids.py', 'thirdstep.csv', intermediaries_path, '--output_csv', 'laststep.csv'])
subprocess.run(['python', '3_compare_node_ids.py', 'thirdstep.csv', officers_path, '--output_csv', 'laststep.csv'])

# Final step: Combine the third-step data with the last-step data.
print("...just about done...")
subprocess.run(['python', '4_last_step.py', 'thirdstep.csv', 'laststep.csv'])
# Function to rearrange names from 'Lastname - Firstname Middlename' to 'Firstname Middlename Lastname' | |
def rearrange_names(name):
    """Turn 'Lastname - Firstname Middlename' into 'Firstname Middlename Lastname'.

    A name is only rewritten when splitting on ' - ' yields exactly two
    pieces; anything else (no hyphen, hyphenated words, multiple separators)
    is returned unchanged.
    """
    if '-' not in name:
        return name
    pieces = name.split(' - ')
    return f"{pieces[1]} {pieces[0]}" if len(pieces) == 2 else name
# Last element of workflow.py, rearrange names in the named_relationships.csv | |
def rearrange_names_in_csv(csv_file_path):
    """Rewrite *csv_file_path* in place, passing its 'source' column through rearrange_names."""
    frame = pd.read_csv(csv_file_path)
    frame['source'] = frame['source'].apply(rearrange_names)
    # Overwrite the original CSV with the rearranged names.
    frame.to_csv(csv_file_path, index=False)


# Final element of the workflow: rearrange names in the named relationships file.
rearrange_names_in_csv('named_relationships.csv')
print("Names have been rearranged in named_relationships.csv.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment