Skip to content

Instantly share code, notes, and snippets.

@gccollect
Last active August 21, 2024 09:20
Show Gist options
  • Save gccollect/cda80827e2dd1181dc02850d1c899a84 to your computer and use it in GitHub Desktop.
Save gccollect/cda80827e2dd1181dc02850d1c899a84 to your computer and use it in GitHub Desktop.
Convert Plutus pdf statement to csv file with running balance
import re
import sys
import pandas as pd
import pdfplumber
re_transaction = re.compile(
r'(Transaction|Card Transaction|Service Fee|Card Transfer)[\n\s](\d{4}/\d{2}/\d{2}, \d{2}:\d{2})\s+(-?\s?.\d*.?\d{0,2})\n?(.*)')
re_deposit = re.compile(r'(Card Deposit|Deposit)[\n\s](\d{4}/\d{2}/\d{2}, \d{2}:\d{2})\s+(.\d*.?\d{0,2})()')
re_page = re.compile(r'\n\d{1,2}/\d{1,2}/\d{4} \d+ / \d+')
def clean_row(row):
return [cell for cell in row if cell is not None and cell != '']
def parse(pdf_file_loc, csv_file_loc):
with pdfplumber.open(pdf_file_loc) as pdf:
content = '\n'.join([page.extract_text_simple() for page in pdf.pages]).replace('\xa0', ' ')
columns = ['Type', 'Date', 'Amount', 'Transaction']
content = re_page.sub('', content)
deposits = re_deposit.findall(content)
transactions = re_transaction.findall(content)
df = pd.DataFrame(deposits + transactions, columns=columns)
df['Amount'] = df['Amount'].replace(r'^(-?)\s?.(\d*\.?\d{0,2})$', value=r"\1\2", regex=True).astype(float)
df['Date'] = pd.to_datetime(df['Date'], format='%Y/%d/%m, %H:%M')
df = df.set_index('Date').sort_index()
df['Balance'] = df['Amount'].cumsum()
print(df)
df.to_csv(csv_file_loc, float_format="%.2f")
if __name__ == '__main__':
args = sys.argv[1:]
parse(*args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment