Last active
March 25, 2024 19:21
-
-
Save tos-kamiya/055745d216edd965ff2631db5ad94714 to your computer and use it in GitHub Desktop.
flexcomm: a flexible comm utility that handles three or more files with user-specified predicates.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import ast
import contextlib
import sys
from typing import List, TextIO, Optional
def get_variables(expr: str) -> List[str]:
    """
    Return the sorted, de-duplicated identifiers referenced in an expression.

    Every ``ast.Name`` node is reported regardless of its role, so a call such
    as ``max(a, b)`` yields ``'max'`` in addition to ``'a'`` and ``'b'``.

    Args:
        expr (str): A Python expression, e.g. ``"a and not b"``. Leading and
            trailing whitespace is ignored.

    Returns:
        List[str]: The distinct names used in the expression, in sorted order.

    Raises:
        SyntaxError: If ``expr`` is not a valid Python expression.
    """
    # Strip before parsing: leading whitespace may be rejected by the parser
    # as an unexpected indent even though the same text is a valid predicate.
    tree = ast.parse(expr.strip(), mode="eval")
    names = {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)}
    return sorted(names)
def _read_item(inp: TextIO) -> Optional[str]:
    """Return the next non-blank line of ``inp`` (whitespace-stripped), or None at end of file."""
    while True:
        line = inp.readline()
        if not line:
            # readline() returns "" only at end of file; a blank line is "\n".
            return None
        item = line.strip()
        if item:
            return item


def evaluate_expressions(
    outp: TextIO, expressions: List[str], inps: List[TextIO], input_file_names: Optional[List[str]] = None
):
    """
    Merge sorted input files and print the lines selected by each predicate.

    The inputs are walked in lockstep, smallest item first. For each distinct
    item, the variables 'a', 'b', 'c', ... are bound to 1 if the corresponding
    file contains the item and 0 otherwise, and every expression is evaluated.
    One tab-separated row is printed per item for which at least one
    expression evaluates to a value greater than 0; column k holds the item if
    expression k selected it and is empty otherwise.

    Lines are compared after stripping surrounding whitespace. Blank lines are
    skipped, so they never end processing of a file early.

    Args:
        outp (TextIO): The output file where the results will be printed.
        expressions (List[str]): Predicate expressions over the variables
            'a', 'b', 'c', ... (one variable per input file, in order).
        inps (List[TextIO]): The input files; each must be strictly increasing
            (sorted, with no repeated lines).
        input_file_names (Optional[List[str]], optional): Names matching
            ``inps``, used only in the "not sorted" error message. Defaults to
            None, in which case the 1-based file position is reported.

    Raises:
        SystemExit: With status 1 (after printing a message to stderr) when an
            input file is found not to be strictly increasing.
    """
    var_names = [chr(ord("a") + i) for i in range(len(inps))]  # 'a', 'b', 'c', ...
    params = ", ".join(var_names)

    # NOTE(security): eval() executes arbitrary Python. The expressions must
    # come from a trusted source (e.g. the user's own command line); never
    # pass text obtained from an untrusted party.
    pred_funcs = [eval(f"lambda {params}: " + e) for e in expressions]

    # Head item of every file; None marks a file that has been fully consumed.
    current_items = [_read_item(inp) for inp in inps]

    def smallest() -> Optional[str]:
        return min((ci for ci in current_items if ci is not None), default=None)

    min_item = smallest()
    while min_item is not None:
        # 1 for each file whose head equals the current item, else 0.
        membership = [1 if ci == min_item else 0 for ci in current_items]
        values = [pf(*membership) > 0 for pf in pred_funcs]
        if any(values):
            print("\t".join((min_item if v else "") for v in values), file=outp)

        # Advance exactly those files that contributed the current item.
        for i, inp in enumerate(inps):
            if current_items[i] != min_item:
                continue
            next_item = _read_item(inp)
            if next_item is not None and next_item <= min_item:
                if input_file_names:
                    print(f"Error: File {input_file_names[i]} is not sorted", file=sys.stderr)
                else:
                    print(f"Error: {i + 1}th file is not sorted", file=sys.stderr)
                sys.exit(1)
            current_items[i] = next_item

        min_item = smallest()
def main():
    """
    Parse command-line arguments, validate the predicates, and run flexcomm.

    Positional arguments are the input files, bound in order to the variables
    'a', 'b', 'c', ...; each ``-p/--predicate`` option adds one output column.
    Results are written to standard output.

    Exits with status 1 (after printing a message to stderr) when more than 26
    files are given, when a predicate is not a valid expression, or when a
    predicate references a name that does not correspond to an input file.
    """
    parser = argparse.ArgumentParser(description="flexcomm - set operations on sorted files")
    parser.add_argument("files", nargs="+", help="input files")
    parser.add_argument(
        "-p", "--predicate", action="append", required=True, help="predicate expression (e.g., 'a - b')"
    )
    args = parser.parse_args()

    # One lowercase letter per file, so at most 26 files can be addressed.
    if len(args.files) > 26:
        print("Error: Too many files.", file=sys.stderr)
        sys.exit(1)

    var_names = [chr(ord("a") + i) for i, _ in enumerate(args.files)]  # 'a', 'b', 'c', ...
    for p in args.predicate:
        try:
            pred_vars = get_variables(p)
        except SyntaxError as e:
            # Report a malformed predicate as a usage error, not a traceback.
            print(f"Error: Invalid predicate expression {p!r}: {e.msg}", file=sys.stderr)
            sys.exit(1)
        unknown_vars = sorted(set(pred_vars).difference(var_names))
        if unknown_vars:
            print(f"Error: Invalid variable name(s) in expression: {', '.join(unknown_vars)}", file=sys.stderr)
            sys.exit(1)

    # ExitStack closes every opened file even if a later open() fails or
    # evaluate_expressions() exits early (e.g. on an unsorted input).
    with contextlib.ExitStack() as stack:
        inps = [stack.enter_context(open(f)) for f in args.files]
        evaluate_expressions(sys.stdout, args.predicate, inps, input_file_names=args.files)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
flexcomm
flexcomm is a utility for performing logical operations on sorted files. It can selectively output lines from the files based on the given logical expressions.
Usage
Example
The following example computes the logical intersection of `file1.txt` and `file2.txt`.

Features
Predicate expressions use the variables `a`, `b`, `c`, etc., representing the input files.

Installation
flexcomm is a single Python script, so you can use it as long as you have Python installed on your system.
License
flexcomm is in the public domain. You can freely use, copy, modify, and redistribute it without any restrictions or attribution requirements.
Contributing
Bug reports, feature requests, and pull requests are welcome!