Skip to content

Instantly share code, notes, and snippets.

@FoobarProtocol
Created October 21, 2023 23:48
Show Gist options
  • Save FoobarProtocol/89905a5d523ded301f91f286bfc7e8c6 to your computer and use it in GitHub Desktop.
Save FoobarProtocol/89905a5d523ded301f91f286bfc7e8c6 to your computer and use it in GitHub Desktop.
This is a robust, self-written script that partitions CSV files based on user input, so let's get to using it, shall we?
import argparse
import csv
import os
def get_total_rows(csv_file):
    """Return the number of data rows in ``csv_file``, excluding the header.

    Rows are counted with ``csv.reader``, so a quoted field that spans
    several physical lines still counts as a single row.

    Args:
        csv_file: Path to the CSV file to inspect.

    Returns:
        The number of rows after the first (header) row. 0 is returned when
        the file is empty or contains only a header.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for any file handed to csv.reader.
    with open(csv_file, 'r', newline='') as f:
        row_count = sum(1 for _ in csv.reader(f))
    # An empty file has no header to subtract; never report a negative count.
    return max(row_count - 1, 0)
def _resolve_row_limit(row_limit, total_rows):
    """Turn a row-limit spec into a positive rows-per-file count.

    Args:
        row_limit: Either a row count (``"100"`` or ``100``) or a percentage
            of the total rows marked with a trailing ``p`` (``"20p"``).
        total_rows: Number of data rows in the input file (must be >= 1).

    Returns:
        The number of data rows to write per output file, or ``None`` after
        printing an error message when the spec is out of range.

    Raises:
        ValueError: If the numeric part of ``row_limit`` is not an integer.
    """
    spec = str(row_limit).strip()
    if spec.endswith('p'):
        percentage = int(spec.rstrip('p'))
        if percentage > 100:
            print("Error: Percentage cannot exceed 100%")
            return None
        if percentage <= 0:
            print("Error: Percentage must be greater than 0")
            return None
        # Integer arithmetic avoids float truncation (int(0.29 * 100) == 28).
        # Clamp to 1 so a small percentage of a short file cannot produce a
        # zero-row chunk, which would never make progress.
        return max(1, percentage * total_rows // 100)

    rows = int(spec)
    if rows <= 0:
        print(f"Error: Number of rows specified ({rows}) must be at least 1")
        return None
    if rows > total_rows:
        print(f"Error: Number of rows specified ({rows}) exceeds total number of rows ({total_rows})")
        return None
    return rows


def split_csv(input_file, row_limit, output_dir=None):
    """Split ``input_file`` into numbered CSV parts that each repeat the header.

    Args:
        input_file: Path to the CSV file to split. Its first row is treated
            as the header and copied into every part.
        row_limit: Maximum data rows per part, given as a count (``"100"``)
            or as a percentage of all data rows with a trailing ``p``
            (``"20p"``).
        output_dir: Directory for the parts; created if it does not exist.
            When omitted, parts are written next to the input as
            ``<input_file>_part<N>.csv``.

    Returns:
        None. Problems with the row limit or an input without data rows are
        reported with ``print`` and no files are written.

    Raises:
        ValueError: If ``row_limit`` is not numeric (optionally ending in ``p``).
    """
    total_rows = get_total_rows(input_file)
    if total_rows <= 0:
        print("Error: Input file contains no data rows to split")
        return

    rows_per_file = _resolve_row_limit(row_limit, total_rows)
    if rows_per_file is None:
        return

    base_name = os.path.splitext(os.path.basename(input_file))[0]
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', newline='') as source:
        reader = csv.reader(source)
        headers = next(reader)

        remaining_rows = total_rows
        part_number = 0
        while remaining_rows > 0:
            chunk_size = min(rows_per_file, remaining_rows)
            part_number += 1
            if output_dir:
                output_path = os.path.join(output_dir, f"{base_name}_part{part_number}.csv")
            else:
                # NOTE: this keeps the historical naming, which appends the
                # suffix to the full input path including its extension.
                output_path = f"{input_file}_part{part_number}.csv"

            # The context manager closes the part even if a write fails.
            with open(output_path, 'w', newline='') as target:
                writer = csv.writer(target)
                writer.writerow(headers)
                for _ in range(chunk_size):
                    row = next(reader, None)
                    if row is None:
                        # Fewer rows than counted earlier; stop this part.
                        break
                    writer.writerow(row)
            remaining_rows -= chunk_size
if __name__ == '__main__':
    # Command-line entry point: every option is optional; the input path and
    # the row limit are asked for interactively when not given as flags.
    cli = argparse.ArgumentParser(description='Split a large CSV file into smaller CSV files.')
    cli.add_argument(
        '-i', '--input', type=str,
        help='Path to the large CSV file that needs to be split.',
    )
    cli.add_argument(
        '-r', '--rows', type=str,
        help='Number or percentage of rows in each smaller CSV file.',
    )
    cli.add_argument(
        '-o', '--output', type=str,
        help='Output directory for smaller CSV files.',
    )
    options = cli.parse_args()

    # A missing or empty flag falls back to a prompt, input path first.
    source_csv = options.input or input("Enter the name or full path of the CSV file: ")
    rows_spec = options.rows or input("Enter the number of rows or percentage (e.g., 100 or 20p) for each partition: ")

    split_csv(source_csv, rows_spec, options.output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment