Created
May 11, 2023 00:19
-
-
Save peckjon/e0ecf60037ac54e69636e11b039c5091 to your computer and use it in GitHub Desktop.
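Python script to split an input file into multiple files of roughly 3900 words each, breaking only on lines that start with the pattern "00;". A new file is started at the first such line once the current file has reached 3900 words, so a file can run somewhat over 3900 words.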
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
# Set the maximum number of words per file | |
max_words = 3900 | |
# Set the pattern to split on | |
pattern = "00;" | |
# Get the input file name from the command line arguments | |
if len(sys.argv) < 2: | |
print("Usage: python split_file.py <input_file>") | |
sys.exit(1) | |
input_file = sys.argv[1] | |
# Initialize the word count and file count | |
word_count = 0 | |
file_count = 1 | |
# Create the output directory if it doesn't exist | |
os.makedirs("output", exist_ok=True) | |
# Open the input file | |
with open(input_file, "r") as f: | |
# Loop through the input file | |
for line in f: | |
# Check if the line starts with the pattern | |
if line.startswith(pattern): | |
# Check if the word count is greater than the maximum | |
if word_count >= max_words: | |
# Increment the file count | |
file_count += 1 | |
# Reset the word count | |
word_count = 0 | |
# Open the current output file | |
with open(f"output/output{file_count}.txt", "a") as out_file: | |
# Write the line to the current output file | |
out_file.write(line) | |
# Increment the word count | |
word_count += len(line.split()) | |
# Trim empty lines off the start and end of the output | |
for i in range(1, file_count + 1): | |
with open(f"output/output{i}.txt", "r") as in_file: | |
lines = in_file.readlines() | |
# Trim empty lines off the start of the output | |
while len(lines) > 0 and lines[0].strip() == "": | |
lines.pop(0) | |
# Trim empty lines off the end of the output | |
while len(lines) > 0 and lines[-1].strip() == "": | |
lines.pop() | |
with open(f"output/output{i}.txt", "w") as out_file: | |
out_file.writelines(lines) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment