Skip to content

Instantly share code, notes, and snippets.

@mcminis1
Created February 1, 2023 19:44
Show Gist options
  • Save mcminis1/a9f18c8ca518ada8e11259e6aeb699d3 to your computer and use it in GitHub Desktop.
Create and query a gpt_index vector index built from the documentation of gpt_index and LangChain.
from bs4 import BeautifulSoup
import re
from gpt_index import Document
from manifest import Manifest
from langchain.llms.manifest import ManifestWrapper
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from gpt_index import LangchainEmbedding
from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, LLMPredictor, PromptHelper
def next_element(elem):
    """Return the next sibling that is a real tag, skipping NavigableStrings.

    Walks ``next_sibling`` until it reaches an object with a ``name``
    attribute (a Tag) or runs off the end of the document (returns None).
    """
    while elem is not None:
        elem = elem.next_sibling
        if hasattr(elem, 'name'):
            return elem
    return None


# Match any heading tag h1..h6; compiled once instead of per document.
re_pattern = re.compile('^h[1-6]')

docs = []
for doc_index in ['gpt-index-latest', 'langchain-latest']:
    # Use a context manager so the file handle is closed deterministically.
    with open(f'sites/{doc_index}/index.html', 'r') as fh:
        html = fh.read()
    # Parse the saved "read the docs" HTML page.
    soup = BeautifulSoup(html, 'html.parser')
    headers = soup.find_all(re_pattern)
    for header in headers:
        # Each section is labeled with its source library and heading title.
        section = 'Library: ' + doc_index + '\n'
        section += 'Title: ' + str(header.text).strip('#') + '\n'
        # Record any anchor links inside the heading itself.
        # (No need to re-parse str(header); the tag can be searched directly.)
        for link in header.find_all('a'):
            section += 'HREF: ' + str(link.get('href'))
        # Accumulate body text until the next heading.
        elem = next_element(header)
        while elem is not None and elem not in headers:
            # BUG FIX: str.strip() never returns None, so the original
            # `... is not None` check was always true. Use truthiness to
            # actually skip whitespace-only elements.
            if elem.text.strip():
                section += '\n' + str(elem.text.strip('\n '))
            elem = next_element(elem)
        docs.append(section)
# Two possible document sources:
#   (a) the text sections scraped from the "read the docs" HTML above:
#       documents = [Document(t) for t in docs]
#   (b) PDFs printed from "read the docs" — this is the path used here.
documents = SimpleDirectoryReader('pdfs').load_data()

# Prompt-shaping parameters for the index:
#   maximum input size, number of output tokens, maximum chunk overlap.
_PROMPT_SETTINGS = (2048, 512, 256)
max_input_size, num_output, max_chunk_overlap = _PROMPT_SETTINGS
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
# A Manifest model server must already be running locally, e.g.:
#   python3 -m manifest.api.app --model_type huggingface --model_name_or_path google/flan-t5-xl --fp16 --device 0
manifest = Manifest(
    client_name="huggingface",
    client_connection="http://127.0.0.1:5000",
)
# Sanity check: show which model the server is actually serving.
print(manifest.client.get_model_params())

# Local HuggingFace sentence embeddings wrapped for gpt_index.
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())

# Near-deterministic generation (very low temperature), capped at 256 tokens.
llm_kwargs = {"temperature": 1e-2, "max_tokens": 256}
wrapped_llm = ManifestWrapper(client=manifest, llm_kwargs=llm_kwargs)
llm_predictor = LLMPredictor(llm=wrapped_llm)

# Build the vector index over the loaded documents and run one query.
index = GPTSimpleVectorIndex(
    documents,
    llm_predictor=llm_predictor,
    prompt_helper=prompt_helper,
    embed_model=embed_model,
)
response = index.query("who wrote LangChain?")
print(response)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment