Last active
April 14, 2023 19:41
-
-
Save lobstrio/5fc088d44bba8383bf3f91acb11ebd3b to your computer and use it in GitHub Desktop.
Scrape PDFs programmatically with Python 3 and the Tika library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tika import parser | |
import re | |
import csv | |
# Column names of the output CSV; also used as the keys of each parsed row.
HEADERS = [
    'numero_gestion',
    'a_jour_au',
    'numero_rcs',
    'date_immatriculation',
    'raison_sociale',
    'forme_juridique',
    'capital_social',
    'adresse_siege',
    'activites_principals',
]
def parse_pdf(filename):
    """Parse one PDF with Tika and extract the registration fields.

    The file is sent to Tika through ``parser.from_file``; the fields are
    then pulled out of the response with one look-behind regex per field.

    Args:
        filename: Path of the PDF file to parse.

    Returns:
        A dict mapping each name in ``HEADERS`` to the extracted string.
        If a label occurs more than once in the document, all of its
        matches are concatenated.

    Raises:
        TypeError: If Tika does not return a dict.
        ValueError: If the Tika status is not 200, or if at least one field
            could not be found in the document.
    """
    raw = parser.from_file(filename)

    # Explicit checks instead of ``assert`` so they survive ``python -O``.
    if not isinstance(raw, dict):
        raise TypeError(
            "unexpected Tika response for {!r}: {}".format(
                filename, type(raw).__name__))
    status = raw['status']
    if status != 200:
        raise ValueError(
            "Tika returned status {!r} for {!r}".format(status, filename))

    # NOTE: the patterns run against the repr of the whole response, not the
    # raw text. In that repr a line break appears as a literal backslash
    # followed by 'n', which is why the "rest of line" patterns below stop
    # at the first backslash ([^\\]+).
    content = str(raw)

    # One pattern per field, in the same order as HEADERS.
    patterns = [
        r'(?<=N° de gestion )\w+',
        r'(?<=à jour au )[\w\s]+',
        r'(?<=Immatriculation au RCS, numéro )[\w\s\.]+',
        r'(?<=Date d\'immatriculation )[\d\/]+',
        r'(?<=Dénomination ou raison sociale )\w+',
        r'(?<=Forme juridique )[^\\]+',
        r'(?<=Capital social )[^\\]+',
        r'(?<=Adresse du siège )[^\\]+',
        r'(?<=Activités principales )[^\\]+',
    ]
    values = ["".join(re.findall(pattern, content)) for pattern in patterns]

    # Every field is mandatory; report which ones are missing.
    missing = [header for header, value in zip(HEADERS, values) if not value]
    if missing:
        raise ValueError(
            "could not extract {} from {!r}".format(
                ", ".join(missing), filename))

    return dict(zip(HEADERS, values))
def write_csv(rows, path='parsed_pdf.csv', fieldnames=None):
    """Write the parsed rows to a CSV file with a header line.

    Args:
        rows: Non-empty sequence of dicts, one per output line, keyed by
            the column names.
        path: Destination file; defaults to ``parsed_pdf.csv`` in the
            current working directory.
        fieldnames: Column names in output order; defaults to ``HEADERS``.

    Raises:
        ValueError: If ``rows`` is empty.
    """
    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if not rows:
        raise ValueError("no rows to write")
    if fieldnames is None:
        fieldnames = HEADERS

    # newline='' is required by the csv module (otherwise blank lines appear
    # between rows on Windows); UTF-8 keeps the accented French text intact
    # regardless of the platform's default encoding.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
if __name__ == "__main__":
    # Script entry point: parse every PDF and dump the rows to a CSV file.
    import sys

    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if not HEADERS:
        raise SystemExit("HEADERS must not be empty")

    # PDF paths given on the command line take precedence; without any
    # argument the original hard-coded sample files are used.
    filenames = sys.argv[1:] or [
        "/Users/sashabouloudnine/Desktop/LOBSTR - Extrait d'immatriculation.pdf",
        "/Users/sashabouloudnine/Desktop/VOSTOKINC - Extrait d'immatriculation.pdf",
        "/Users/sashabouloudnine/Desktop/CAPTAIN DATA - Extrait d'immatriculation.pdf",
        "/Users/sashabouloudnine/Desktop/PHANTOMBUSTER - Extrait d'immatriculation.pdf",
    ]

    rows = [parse_pdf(filename) for filename in filenames]
    write_csv(rows)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment