Created
February 1, 2023 19:44
-
-
Save mcminis1/a9f18c8ca518ada8e11259e6aeb699d3 to your computer and use it in GitHub Desktop.
Create and query a GPT Index vector index built from the documentation of the gpt_index and LangChain libraries.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re | |
from gpt_index import Document | |
from manifest import Manifest | |
from langchain.llms.manifest import ManifestWrapper | |
from langchain.embeddings.huggingface import HuggingFaceEmbeddings | |
from gpt_index import LangchainEmbedding | |
from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, LLMPredictor, PromptHelper | |
# Build one plain-text document per header-delimited section of each
# library's rendered "read the docs" HTML page.
docs = []
for doc_index in ['gpt-index-latest', 'langchain-latest']:
    # Use a context manager so the file handle is always closed.
    with open(f'sites/{doc_index}/index.html', 'r') as f:
        html = f.read()
    # parsing the html file
    soup = BeautifulSoup(html, 'html.parser')

    def next_element(elem):
        """Return the next sibling that is a real tag, skipping NavigableStrings."""
        while elem is not None:
            # Find next element, skip NavigableString objects
            elem = elem.next_sibling
            if hasattr(elem, 'name'):
                return elem

    # Every h1-h6 header starts a new section.
    re_pattern = re.compile('^h[1-6]')
    headers = soup.find_all(re_pattern)
    for header in headers:
        section = 'Library: ' + doc_index + '\n'
        section += 'Title: ' + str(header.text).strip('#') + '\n'
        # Keep any anchor hrefs inside the header so the source link survives.
        for link in BeautifulSoup(str(header), 'html.parser').find_all('a'):
            section += 'HREF: ' + str(link.get('href'))
        # Accumulate body text until we hit the next header.
        elem = next_element(header)
        while elem is not None and elem not in headers:
            # BUG FIX: str.strip() never returns None, so the original
            # `if elem.text.strip() is not None` was always true and let
            # empty elements through; test truthiness instead.
            if elem.text.strip():
                section += '\n' + str(elem.text.strip('\n '))
            elem = next_element(elem)
        docs.append(section)
### pull the documents from the text sections from the html version of "read the docs"
# documents = [Document(t) for t in docs]
### pull the data from pdfs printed from "read the docs"
documents = SimpleDirectoryReader('pdfs').load_data()

# Prompt sizing: maximum input size, number of output tokens, and
# maximum chunk overlap, all in tokens.
max_input_size, num_output, max_chunk_overlap = 2048, 512, 256
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
# A Manifest model server must already be running locally, e.g.:
# python3 -m manifest.api.app --model_type huggingface --model_name_or_path google/flan-t5-xl --fp16 --device 0
manifest = Manifest(
    client_name="huggingface",
    client_connection="http://127.0.0.1:5000",
)
# Sanity check: show the parameters of the model the server is hosting.
print(manifest.client.get_model_params())

# Embeddings come from a local HuggingFace model wrapped for gpt_index;
# generation goes through the Manifest server via LangChain's wrapper.
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
generation_kwargs = {"temperature": 1e-2, "max_tokens": 256}
llm_predictor = LLMPredictor(
    llm=ManifestWrapper(client=manifest, llm_kwargs=generation_kwargs)
)

# Build the vector index over the loaded documents, then run one query.
index = GPTSimpleVectorIndex(
    documents,
    llm_predictor=llm_predictor,
    prompt_helper=prompt_helper,
    embed_model=embed_model,
)
response = index.query("who wrote LangChain?")
print(response)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment