Created
January 1, 2019 22:25
-
-
Save FilipDominec/70173979cfe644139a64191e9276c530 to your computer and use it in GitHub Desktop.
Parses one or more XLS files, each containing an arbitrary number of sheets, and filters all rows for a pattern.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Parses one or more XLS files, each containing an arbitrary number of sheets, and filters all
rows for a pattern as determined by the criteria in filter_rows() below. Prints matching names.

Typical invocation:
    python3 ./xlsfilter.py *201{6,7,8}.{1,2}.xlsx | vi -

Supposed to help find interesting spectroscopic data on a shared network drive. Can be adapted
for any similar task in the future.
"""
## Import common moduli
import sys, os, time, re
from pathlib import Path
import pandas as pd
import collections as co

# Only sheets whose name contains a four-digit group (presumably a year tag — TODO confirm)
# are parsed.  Compiled once with a raw string: the original '\d\d\d\d' literal relied on
# invalid escape sequences, which modern Python flags with a DeprecationWarning.
SHEET_NAME_PATTERN = re.compile(r'\d{4}')


def filter_rows(df):
    """Return the list of 'Name' values from rows matching the hard-coded measurement
    criteria: 'Filtr (%)' == 0.1, 'Acqt. (s)' == 1 and 'x' == -25000.

    Raises KeyError if any of the required columns is missing from the sheet.
    """
    criteria = (df['Filtr (%)'] == 0.1) & (df['Acqt. (s)'] == 1) & (df['x'] == -25000)
    return list(df[criteria]['Name'].values)


def main(argv):
    """Scan each Excel file named in argv; print one tab-separated line per matching row
    (name, sheet name, source file), then any per-sheet errors and a duplicity warning."""
    matching_names, matching_dates, matching_xls = [], [], []
    errors = []
    duplicities = []
    for arg in argv:
        sheet_to_df_map = co.OrderedDict()
        skipped_sheet_names = []
        # Context manager closes the workbook deterministically; the original leaked
        # one open file handle per argument.
        with pd.ExcelFile(arg) as xls:
            for sheet_name in xls.sheet_names:
                if SHEET_NAME_PATTERN.search(sheet_name):
                    sheet_to_df_map[sheet_name] = xls.parse(sheet_name)
                else:
                    skipped_sheet_names.append(sheet_name)
        # Iterate sheets in reverse workbook order, matching the original behaviour.
        for key, df in list(sheet_to_df_map.items())[::-1]:
            try:
                new_names = filter_rows(df)
            except KeyError as e:
                # A sheet without the expected columns is reported, not fatal.
                errors.append("ERROR: could not parse or filter {:} in {:}:\n\t\t {:}".format(key, arg, e))
                continue
            for nn in new_names:
                if nn in matching_names:
                    duplicities.append(nn)
            matching_names += new_names
            matching_dates += [key] * len(new_names)
            matching_xls += [arg] * len(new_names)
    for n, d, a in zip(matching_names, matching_dates, matching_xls):
        print(n + '\t' + d + '\t' + a)
    for err in errors:
        print(err)
    if duplicities:
        print('WARNING: found {:} duplicities in above printed names: {:}'.format(len(duplicities), duplicities))


if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment