Created
September 3, 2021 12:57
-
-
Save BHEADRICK/55138272e0c92096abe0d3abc09bd792 to your computer and use it in GitHub Desktop.
Split a CSV file into fixed-size parts, retaining the header row in each part
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import pandas as pd

# CSV file name to be read in.
in_csv = 'filename.csv'

# Number of data rows written to each output file;
# change it according to your needs.
rowsize = 20000


def split_csv(in_csv, rowsize, out_file_prefix=None):
    """Split a CSV file into numbered part files that each keep the header.

    The input is streamed once in chunks of ``rowsize`` data rows, so the
    whole file is never held in memory and is not re-read for every part.

    Args:
        in_csv: Path of the CSV file to split. Its first row is the header.
        rowsize: Maximum number of data rows per output file (>= 1).
        out_file_prefix: Prefix for the output file names. Defaults to
            ``in_csv`` without its extension.

    Returns:
        The list of output paths written, in order. Part ``k`` is written
        to ``<out_file_prefix>-part-<k>.csv``.

    Raises:
        ValueError: If ``rowsize`` is smaller than 1.
    """
    if rowsize < 1:
        raise ValueError("rowsize must be a positive integer")
    if out_file_prefix is None:
        # Strip only the extension: 'filename.csv' -> 'filename'.
        out_file_prefix = os.path.splitext(in_csv)[0]

    written = []
    reader = pd.read_csv(in_csv, chunksize=rowsize)
    try:
        for chunk in reader:
            # A header-only input yields no data rows; write no part for it.
            if chunk.empty:
                continue
            # Indexed output name: filename-part-1.csv, filename-part-2.csv, ...
            out_csv = out_file_prefix + '-part-' + str(len(written) + 1) + '.csv'
            # Overwrite rather than append, so re-running the script does not
            # stack a second header and data copy onto an existing part file.
            chunk.to_csv(out_csv, index=False, mode='w')
            written.append(out_csv)
    finally:
        # Release the underlying file handle even if a write fails.
        reader.close()
    return written


if __name__ == '__main__':
    split_csv(in_csv, rowsize)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment