Created
January 29, 2024 21:03
-
-
Save shawngraham/e26db0d7189df12bb0ffaf992308c2dd to your computer and use it in GitHub Desktop.
An attempt at building a retriever for gpt-researcher that searches the Open Context API.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
class OpenContextSearch():
    """
    Open Context Search Retriever

    Sends a query to the Open Context JSON search endpoint and reduces each
    entry of the response's 'features' list to a small dictionary holding
    its label, id, uri and category.
    """

    # Endpoint queried by search().
    BASE_URL = "https://opencontext.org/query/.json"

    # content_type values that are forwarded as the 'type' query parameter;
    # any other value (e.g. 'everything') sends no type filter at all.
    FILTERABLE_TYPES = ('subjects', 'media', 'projects')

    def __init__(self, query, content_type='subjects'):
        """
        Initializes the OpenContextSearch object
        Args:
            query: The search query (topic of interest)
            content_type: The type of content to search for ('subjects', 'media', 'projects', or 'everything')
        """
        self.query = query
        self.content_type = content_type

    @staticmethod
    def _format_result(result):
        """
        Reduces one entry of the 'features' list to the fields we report
        Args:
            result: A dictionary describing a single feature
        Returns:
            A dictionary with 'label', 'id', 'uri' and 'category' keys; a
            placeholder string is used for any value that is missing
        """
        properties = result.get('properties')
        # A missing or null 'properties' member must not break the lookup
        # of the nested fields below.
        if not isinstance(properties, dict):
            properties = {}
        return {
            "label": result.get('label', 'No label provided'),
            "id": result.get('id', 'No ID provided'),
            "uri": properties.get('uri', 'No URI provided'),
            "category": properties.get('item category', 'No category provided'),
            # Add more fields as necessary
        }

    def search(self, max_results=5, timeout=30):
        """
        Searches the OpenContext API for records related to the query
        Args:
            max_results: The maximum number of results to retrieve
            timeout: Seconds to wait for the server before giving up
        Returns:
            A list of dictionary objects containing information about each
            record, or None when the request fails, the server answers with
            a status other than 200, or the body is not a JSON object
        """
        print(f"Searching OpenContext for '{self.query}'...")
        params = {
            'q': self.query,
            'rows': max_results,
        }
        if self.content_type in self.FILTERABLE_TYPES:
            params['type'] = self.content_type
        headers = {'User-Agent': 'oc-api-client'}

        # Without a timeout a stalled server would block the caller forever,
        # and connection errors would escape as exceptions even though every
        # other failure of this method is reported by returning None.
        try:
            resp = requests.get(
                self.BASE_URL, params=params, headers=headers, timeout=timeout
            )
        except requests.RequestException as e:
            print(f"Failed to retrieve data: {e}")
            return None
        print(resp.url)

        if resp.status_code != 200:
            print(f"Failed to retrieve data: HTTP {resp.status_code}")
            return None

        # JSON decoding errors raised by resp.json() are ValueError subclasses.
        try:
            search_results = resp.json()
        except ValueError as e:
            print(f"Failed to parse the response: {e}")
            return None
        if not isinstance(search_results, dict):
            print("Failed to parse the response: expected a JSON object")
            return None

        # Here we access the 'features' key directly; a missing or null
        # value is treated as "no results".
        features = search_results.get('features') or []
        return [
            self._format_result(feature)
            for feature in features
            if isinstance(feature, dict)
        ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment