Created
November 4, 2023 23:12
-
-
Save PatWalters/854a73154feaaeadce8b3f33bf1ce121 to your computer and use it in GitHub Desktop.
Parse a larger ChemFP output file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import sys | |
import pandas as pd | |
from tqdm.auto import tqdm | |
# Number of leading lines skipped before the tab-separated data rows begin.
# NOTE(review): presumably the ChemFP header block -- confirm against the
# simsearch version that produced the input.
SKIP_ROWS = 8

# Number of data rows read (and written) per batch to bound memory use.
CHUNK_SIZE = 1000


def extract_hits(rows, cutoff):
    """Collect ``[query, name, sim]`` triples whose similarity is below *cutoff*.

    Each row is laid out as ``count, query, name_1, sim_1, name_2, sim_2, ...``.
    The leading count is ignored; the remaining fields are consumed as
    (name, sim) pairs.

    Args:
        rows: Iterable of indexable, sliceable rows (lists or NumPy arrays).
        cutoff: Similarity threshold; a pair is kept when
            ``float(sim) < cutoff``.

    Returns:
        A list of ``[query, name, sim]`` lists, in input order. ``sim`` is
        stored exactly as it appeared in the row (not the converted float).

    Raises:
        ValueError: If a similarity field cannot be converted to ``float``.
    """
    # NOTE(review): this keeps pairs strictly BELOW the cutoff, exactly as the
    # original script did -- confirm that is the intended direction.
    hits = []
    for row in rows:
        query = row[1]
        fields = row[2:]
        # Pair even/odd positions. An unpaired trailing field is ignored
        # instead of leaking StopIteration from a manual next() call.
        # NaN similarities never compare below the cutoff, so NaN-valued
        # pairs are dropped naturally.
        for name, sim in zip(fields[0::2], fields[1::2]):
            if float(sim) < cutoff:
                hits.append([query, name, sim])
    return hits


def main(argv=None):
    """Filter a tab-separated similarity file into a ``query,name,sim`` CSV.

    Args:
        argv: Argument vector ``[prog, cutoff, infile, outfile]``; defaults
            to ``sys.argv``.

    Exits with status 1 after printing a usage line when the argument count
    is wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 4:
        print(f"usage: {argv[0]} cutoff infile outfile")
        sys.exit(1)

    cutoff = float(argv[1])
    infile_name = argv[2]
    outfile_name = argv[3]

    # NOTE(review): read_csv is expected to fix the column count from the
    # first data row; confirm the input never has later rows with more fields.
    reader = pd.read_csv(
        infile_name,
        sep="\t",
        skiprows=SKIP_ROWS,
        header=None,
        chunksize=CHUNK_SIZE,
    )
    for idx, chunk in enumerate(tqdm(reader)):
        result_df = pd.DataFrame(
            extract_hits(chunk.values, cutoff),
            columns=["query", "name", "sim"],
        )
        if idx == 0:
            # First chunk creates/truncates the file and writes the header.
            result_df.to_csv(outfile_name, index=False)
        else:
            # Later chunks append rows only.
            result_df.to_csv(outfile_name, index=False, mode="a", header=False)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment