Created
February 1, 2023 19:44
-
-
Save mcminis1/a9f18c8ca518ada8e11259e6aeb699d3 to your computer and use it in GitHub Desktop.
Create and query a GPT Index vector index built from the documentation of the gpt_index and LangChain libraries.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re | |
from gpt_index import Document | |
from manifest import Manifest | |
from langchain.llms.manifest import ManifestWrapper | |
from langchain.embeddings.huggingface import HuggingFaceEmbeddings | |
from gpt_index import LangchainEmbedding | |
from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, LLMPredictor, PromptHelper | |
# Build one plain-text document per header-delimited section of each
# library's rendered "read the docs" HTML page.
docs = []
for doc_index in ['gpt-index-latest', 'langchain-latest']:
    # Use a context manager so the file handle is always closed.
    with open(f'sites/{doc_index}/index.html', 'r') as f:
        html = f.read()
    # parsing the html file
    soup = BeautifulSoup(html, 'html.parser')

    def next_element(elem):
        """Return the next sibling that is a real tag, skipping NavigableStrings."""
        while elem is not None:
            # Find next element, skip NavigableString objects
            elem = elem.next_sibling
            if hasattr(elem, 'name'):
                return elem

    # Every h1-h6 header starts a new section.
    re_pattern = re.compile('^h[1-6]')
    headers = soup.find_all(re_pattern)
    for header in headers:
        section = 'Library: ' + doc_index + '\n'
        section += 'Title: ' + str(header.text).strip('#') + '\n'
        # Keep any anchor hrefs inside the header so the source link survives.
        for link in BeautifulSoup(str(header), 'html.parser').find_all('a'):
            section += 'HREF: ' + str(link.get('href'))
        # Accumulate body text until we hit the next header.
        elem = next_element(header)
        while elem is not None and elem not in headers:
            # BUG FIX: str.strip() never returns None, so the original
            # `if elem.text.strip() is not None` was always true and let
            # empty elements through; test truthiness instead.
            if elem.text.strip():
                section += '\n' + str(elem.text.strip('\n '))
            elem = next_element(elem)
        docs.append(section)
### pull the documents from the text sections from the html version of "read the docs"
# documents = [Document(t) for t in docs]
### pull the data from pdfs printed from "read the docs"
documents = SimpleDirectoryReader('pdfs').load_data()

# Prompt sizing: maximum input size, number of output tokens, and
# maximum chunk overlap, all in tokens.
max_input_size, num_output, max_chunk_overlap = 2048, 512, 256
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
# A Manifest model server must already be running locally, e.g.:
# python3 -m manifest.api.app --model_type huggingface --model_name_or_path google/flan-t5-xl --fp16 --device 0
manifest = Manifest(
    client_name="huggingface",
    client_connection="http://127.0.0.1:5000",
)
# Sanity check: show the parameters of the model the server is hosting.
print(manifest.client.get_model_params())

# Embeddings come from a local HuggingFace model wrapped for gpt_index;
# generation goes through the Manifest server via LangChain's wrapper.
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
generation_kwargs = {"temperature": 1e-2, "max_tokens": 256}
llm_predictor = LLMPredictor(
    llm=ManifestWrapper(client=manifest, llm_kwargs=generation_kwargs)
)

# Build the vector index over the loaded documents, then run one query.
index = GPTSimpleVectorIndex(
    documents,
    llm_predictor=llm_predictor,
    prompt_helper=prompt_helper,
    embed_model=embed_model,
)
response = index.query("who wrote LangChain?")
print(response)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment