Last active
October 7, 2023 04:57
-
-
Save Daethyra/1b39793953fb3fdb896867c39b172feb to your computer and use it in GitHub Desktop.
Python module for Querying Local Documents w/ LangChain and Chroma
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import glob | |
from dotenv import load_dotenv | |
from retrying import retry | |
from langchain.document_loaders import PyPDFLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores import Chroma | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.llms import OpenAI as OpenAILLM | |
from langchain.chains.question_answering import load_qa_chain | |
# Define the retrying decorator for specific functions | |
def retry_if_value_error(exception):
    """Retry predicate for the ``retrying`` decorator.

    Returns True when *exception* is a ValueError (retry), False for
    any other exception type (give up immediately).
    """
    should_retry = isinstance(exception, ValueError)
    return should_retry
def retry_if_file_not_found_error(exception):
    """Retry predicate for the ``retrying`` decorator.

    Returns True when *exception* is a FileNotFoundError (retry),
    False for any other exception type (give up immediately).
    """
    should_retry = isinstance(exception, FileNotFoundError)
    return should_retry
class PDFProcessor:
    """
    Handle PDF document loading, chunking, and similarity search.

    Attributes
    ----------
    OPENAI_API_KEY : str
        OpenAI API key read from the environment (validated non-empty).
    embeddings : OpenAIEmbeddings
        Embedding model used to vectorize document chunks.
    llm : OpenAILLM
        Language model handed to the QA chain by the caller.

    Methods
    -------
    get_user_query(prompt="Please enter your query: "):
        Get a query string from the user via stdin.
    load_pdfs_from_directory(directory_path='data/'):
        Load and chunk every PDF in a directory.
    _load_and_split_document(file_path, chunk_size=2000, chunk_overlap=0):
        Load and split a single PDF into text chunks.
    perform_similarity_search(docsearch, query):
        Run a similarity search against a Chroma vector store.
    """

    def __init__(self):
        """Initialize with environment variables and reusable model objects."""
        self._load_env_vars()
        self._initialize_reusable_objects()

    @retry(retry_on_exception=retry_if_value_error, stop_max_attempt_number=3)
    def _load_env_vars(self):
        """Load and validate required environment variables.

        Raises
        ------
        ValueError
            If OPENAI_API_KEY is unset or empty.
        """
        try:
            load_dotenv()
            # BUG FIX: the original defaulted to the placeholder 'sk-', which
            # is truthy and made the missing-key check below unreachable.
            # Default to '' so a missing key is actually detected.
            self.OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
            if not self.OPENAI_API_KEY:
                raise ValueError("OPENAI_API_KEY is missing. Please set the environment variable.")
        except ValueError as ve:
            print(f"ValueError encountered: {ve}")
            raise

    def _initialize_reusable_objects(self):
        """Initialize reusable objects: the embedding model and the LLM."""
        self.embeddings = OpenAIEmbeddings(openai_api_key=self.OPENAI_API_KEY)
        # temperature=0 for deterministic answers in the QA chain.
        self.llm = OpenAILLM(temperature=0, openai_api_key=self.OPENAI_API_KEY)

    @staticmethod
    def get_user_query(prompt="Please enter your query: "):
        """
        Get user input for a query.

        Parameters:
            prompt (str): The prompt to display for user input.

        Returns:
            str: User's query input.
        """
        return input(prompt)

    @retry(retry_on_exception=retry_if_file_not_found_error, stop_max_attempt_number=3)
    def load_pdfs_from_directory(self, directory_path='data/'):
        """
        Load all PDF files from a given directory.

        Parameters:
            directory_path (str): Directory path to load PDFs from.

        Returns:
            list: List of text chunks from all loaded PDFs.

        Raises:
            FileNotFoundError: If the directory does not exist or contains
                no PDF files.
        """
        try:
            if not os.path.exists(directory_path):
                raise FileNotFoundError(f"The directory {directory_path} does not exist.")
            # os.path.join instead of string formatting for portable paths.
            pdf_files = glob.glob(os.path.join(directory_path, '*.pdf'))
            if not pdf_files:
                raise FileNotFoundError(f"No PDF files found in the directory {directory_path}.")
            all_texts = []
            for pdf_file in pdf_files:
                all_texts.extend(self._load_and_split_document(pdf_file))
            return all_texts
        except FileNotFoundError as fe:
            print(f"FileNotFoundError encountered: {fe}")
            raise

    def _load_and_split_document(self, file_path, chunk_size=2000, chunk_overlap=0):
        """
        Load and split a PDF document into text chunks.

        Parameters:
            file_path (str): Path to the PDF file.
            chunk_size (int): Size of each text chunk.
            chunk_overlap (int): Overlapping characters between chunks.

        Returns:
            list: List of text chunks (langchain Documents).

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")
        loader = PyPDFLoader(file_path)
        data = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return text_splitter.split_documents(data)

    def perform_similarity_search(self, docsearch, query):
        """
        Perform similarity search on documents based on a query.

        Parameters:
            docsearch (Chroma): Chroma object containing document vectors.
            query (str): User query for similarity search.

        Returns:
            list: List of similar documents or chunks.

        Raises:
            ValueError: If the query is empty.
        """
        if not query:
            raise ValueError("Query should not be empty.")
        return docsearch.similarity_search(query)
if __name__ == "__main__":
    try:
        # Initialize the processor (loads env vars, builds embeddings/LLM).
        pdf_processor = PDFProcessor()

        # Load and chunk every PDF in the default 'data/' directory.
        texts = pdf_processor.load_pdfs_from_directory()
        num_docs = len(texts)
        print(f'Loaded {num_docs} document(s).')

        # Index the chunks in an in-memory Chroma vector store.
        docsearch = Chroma.from_documents(texts, pdf_processor.embeddings)

        # Build a "stuff" QA chain: all retrieved chunks go into one prompt.
        chain = load_qa_chain(pdf_processor.llm, chain_type="stuff")

        # Ask the user for a query, then retrieve the most similar chunks.
        query = pdf_processor.get_user_query()
        result = pdf_processor.perform_similarity_search(docsearch, query)

        # BUG FIX: the original discarded the chain's answer entirely —
        # capture it and show it to the user.
        answer = chain.run(input_documents=result, question=query)
        print(answer)
    except Exception as e:
        # Top-level boundary: report any failure instead of a raw traceback.
        print(f"An error occurred: {e}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment