Last active
July 3, 2024 01:44
-
-
Save htlin222/5eb77d647635664b03327055839520f9 to your computer and use it in GitHub Desktop.
This script extracts the text from a PDF file and converts it to JSON format. It is used for foundation_one NGS reports, for the biomarker_findings section.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# title: main
# author: Hsieh-Ting Lin, the Lizard 🦎
# description: This script extracts the text from a PDF file and converts it to JSON format. It is used for foundation_one NGS reports, for the biomarker_findings section.
# date: "2024-07-01"
# --END-- #
import json
from typing import Any, Optional

from openai import OpenAI
from PyPDF2 import PdfReader
# Input PDF and output JSON locations (relative to the working directory).
PDF_PATH = "./foundation_one_demopdf.pdf"
OUTPUT_PATH = "result.json"

# JSON skeleton shown to the model so its answer follows this structure.
TEMPLATE = """
{\n \"biomarker_findings\": {\n \"microsatellite_status\": ...\n \"tumor_mutational_burden\": ...\n \"genomic_findings\": {\n ...\n },\n \"disease_relevant_genes\": ...\n }\n}\n
"""


def build_prompt(text1: str, text2: str) -> str:
    """Return the user prompt combining the template and both page texts.

    Args:
        text1: Text extracted from the first PDF page.
        text2: Text extracted from the second PDF page.
    """
    return (
        f"Convert these texts to JSON format, with template{TEMPLATE}:"
        f"\n\nPage 1:\n{text1}\n\nPage 2:\n{text2}"
    )


def parse_json_reply(reply: Optional[str]) -> Any:
    """Parse the model's reply into a Python object.

    The reply is first parsed as-is. If that fails, the text between the
    first ``{`` and the last ``}`` is parsed instead, so a reply wrapped in
    a Markdown code fence or surrounded by explanatory prose still works.

    Args:
        reply: Raw message content returned by the model (may be ``None``).

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If the reply is ``None``, empty or whitespace only.
        json.JSONDecodeError: If no valid JSON can be recovered (this is a
            subclass of ``ValueError``).
    """
    if reply is None or not reply.strip():
        raise ValueError("The model returned an empty reply; nothing to parse.")
    try:
        return json.loads(reply)
    except json.JSONDecodeError:
        start = reply.find("{")
        end = reply.rfind("}")
        if start == -1 or end <= start:
            # No object-shaped span to fall back on: surface the original error.
            raise
        return json.loads(reply[start : end + 1])


def main() -> None:
    """Convert the first two pages of the PDF report to ``result.json``.

    Reads ``PDF_PATH``, sends the text of pages 1 and 2 to the chat model
    together with ``TEMPLATE``, and writes the parsed answer to
    ``OUTPUT_PATH``. If the PDF has fewer than two pages, a message is
    printed and nothing else happens.
    """
    # Load the PDF
    reader = PdfReader(PDF_PATH)

    # Make sure the PDF has at least 2 pages
    if len(reader.pages) < 2:
        print("The PDF has less than 2 pages.")
        return

    # Index 0 is the first page, index 1 is the second page.
    text1 = reader.pages[0].extract_text()
    text2 = reader.pages[1].extract_text()

    # No credentials are passed here; the client resolves them on its own.
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": build_prompt(text1, text2)}],
    )

    # Parse BEFORE opening the output file: opening with "w" truncates it,
    # so a parse failure must not wipe out a previous result.
    parsed = parse_json_reply(response.choices[0].message.content)

    # Save the result as a JSON file. ensure_ascii=False writes non-ASCII
    # characters verbatim, so the encoding is pinned instead of relying on
    # the platform default.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as json_file:
        json.dump(parsed, json_file, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()
requirements.txt
annotated-types==0.7.0
anyio==4.4.0
certifi==2024.6.2
distro==1.9.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
idna==3.7
openai==1.35.7
pycryptodome==3.20.0
pydantic==2.8.0
pydantic-core==2.20.0
pypdf2==3.0.1
sniffio==1.3.1
tqdm==4.66.4
typing-extensions==4.12.2
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Result: