Skip to content

Instantly share code, notes, and snippets.

@pjmagee
Created July 27, 2024 21:08
Show Gist options
  • Save pjmagee/83cb204255b475e2a6d4824f10816f6d to your computer and use it in GitHub Desktop.
Save pjmagee/83cb204255b475e2a6d4824f10816f6d to your computer and use it in GitHub Desktop.
2024-07-27 22:07:52,209 - INFO - Making request to https://starwars.fandom.com/api.php?action=parse&page=Aaricetri&format=json&prop=sections%7Cproperties
2024-07-27 22:07:52,427 - INFO - Making request to https://starwars.fandom.com/api.php?action=parse&page=Aaricetri&format=json&prop=categories
2024-07-27 22:07:52,641 - INFO - {
"title": "Aaricetri",
"sections": [
"Overview [ ] Two types of planets existed in the galaxy : gas giants and terrestrial rocky worlds. Moons orbiting gas giants and other rocky worlds were usually not considered planets, even though they could be larger or more populated than other planets. However, Zonama Sekot was considered to be both at different time periods due to its unique mobile nature. Some planets, like Balnab , were still going through stages of organic development Depending on the tilt of its axis, a planet might have had varied seasons with changing temperatures and weather, affecting its habitability. Planets were extremely varied in terms of environmental conditions and landscape, ranging from completely oceanic, such as Manaan , Dac , and Kamino , to thick arboreal rainforests, such as Kashyyyk and Felucia . Some planets, such as Hoth , were completely frozen, while other planets, such as Nelvaan , were experiencing an ice age. Some planets, like Geonosis , Korriban , and Tatooine , were completely barren and desolate, covered in little more than rock and sand; others, such as Coruscant and Taris , became an ecumenopolis , in which the entire planetary surface was covered by one gigantic city. Many planets, like Dantooine , Naboo , Alderaan , and Corellia , contained a mix of landscapes, with various continents, oceans, and mountains. Most life-bearing planets had primarily nitrogen - and oxygen -based Type I atmospheres , though some, such as Dorin , Gand , and Skako , had unique atmospheres to which their native life forms had adapted. Gas giants , planets made up of primarily gaseous layers, were rarely inhabited, though they were sometimes used for resource collection (such as the Tibanna gas operations on Bespin ). Some planets had a large collection of space debris orbiting them, referred to as \"rings.\" These rings ranged from being thin to being expansive.",
"Sentience [ ] Sentience among planets, while rare, was not unheard of. Yuuzhan'tar , Zonama Sekot and D'vouran were three notable examples of sentient planets."
],
"categories": [
"Pages_using_DynamicPageList3_parser_function",
"Legends_articles",
"Articles_from_unlicensed_sources",
"Locations_without_grid_coordinates",
"Planets"
],
"infoboxes": [
{
"title": "Aaricetri",
"Astrographical information": {
"Region": {
"value": "Imperial Outer Rim",
"links": [
{
"href": "/wiki/Outer_Rim_Territories/Legends",
"text": "Imperial Outer Rim"
}
]
},
"System": {
"value": "Nass Diona system",
"links": []
},
"Orbital position": {
"value": "4",
"links": []
}
},
"Physical information": {
"Points of interest": {
"value": "Docking Bay 56 Wildnite",
"links": []
}
},
"Societal information": {
"Native species": {
"value": "Aaricetrian",
"links": []
},
"Demonym": {
"value": "Aaricetrian",
"links": []
},
"Major cities": {
"value": "Penshakka",
"links": []
}
}
}
]
}
import json
import logging
import os
import re
from urllib.parse import urlencode

import openai
import requests
from bs4 import BeautifulSoup
# Read the API key from the environment so a secret never has to be pasted
# into this file. Falls back to the previous empty-string value when the
# OPENAI_API_KEY variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Root logger: INFO and above, timestamped.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# MediaWiki API endpoint (starwars.fandom.com) used by every request helper.
url = "https://starwars.fandom.com/api.php"

# Section headings that get_page_props drops from a page's section list.
ignore_sections = [
    "Appearances",
    "Sources",
    "Notes and references",
    "External links",
    "Behind the scenes",
    "Non-canon appearances",
    "Real-world similarities",
    "Non-canon sources",
]
def get_category_members(name: str = "Planets", limit: int = 10, timeout: float = 30):
    """Fetch the members of a wiki category through ``list=categorymembers``.

    Args:
        name: Category name without the ``Category:`` prefix.
        limit: Value sent as ``cmlimit`` (how many members to request).
        timeout: Seconds to wait for the server before giving up; without it
            a stalled connection would block the script indefinitely.

    Returns:
        The ``query.categorymembers`` list from the JSON response.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            non-2xx HTTP status.
        KeyError: If the JSON body lacks ``query.categorymembers``.
    """
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": "Category:" + name,
        "cmlimit": limit,
        "format": "json",
    }
    # The URL is rebuilt only for the log line; requests encodes params itself.
    logging.info("Making request to %s", f"{url}?{urlencode(params)}")
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()['query']['categorymembers']
def get_page_props(title: str, timeout: float = 30):
    """Fetch a page's section list and parsed infoboxes via ``action=parse``.

    Args:
        title: Page title to parse.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(page_sections, page_infoboxes)`` tuple. ``page_sections`` holds
        the section entries whose ``line`` is not listed in
        ``ignore_sections``; ``page_infoboxes`` holds one ``parse_infobox``
        result per non-empty ``infoboxes`` page property.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            non-2xx HTTP status.
        KeyError: If the JSON body lacks ``parse.sections`` or
            ``parse.properties``.
    """
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "sections|properties",
    }
    logging.info("Making request to %s", f"{url}?{urlencode(params)}")
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    parsed = response.json()['parse']

    page_sections = [
        section for section in parsed['sections']
        if section['line'] not in ignore_sections
    ]
    # "prop" rather than "property", which would shadow the builtin.
    page_infoboxes = [
        parse_infobox(prop['*'])
        for prop in parsed['properties']
        if prop['name'] == 'infoboxes' and prop['*']
    ]
    return page_sections, page_infoboxes
def get_section_content(page_title, section_index, timeout: float = 30):
    """Fetch the rendered HTML of one section of a page via ``action=parse``.

    Args:
        page_title: Title of the page to parse.
        section_index: Section identifier sent as the ``section`` parameter
            (callers pass the ``index`` field of a section entry).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The HTML string found at ``parse.text["*"]`` in the JSON response.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            non-2xx HTTP status.
        KeyError: If the JSON body lacks ``parse.text``.
    """
    params = {
        "action": "parse",
        "page": page_title,
        "prop": "text",
        "section": section_index,
        "format": "json",
    }
    logging.info("Making request to %s", f"{url}?{urlencode(params)}")
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()["parse"]["text"]["*"]
def get_page_categories(title: str, timeout: float = 30):
    """Fetch the category names of a page via ``action=parse&prop=categories``.

    Args:
        title: Page title to parse.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the ``"*"`` value of every entry in ``parse.categories``.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            non-2xx HTTP status.
        KeyError: If the JSON body lacks ``parse.categories``.
    """
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "categories",
    }
    logging.info("Making request to %s", f"{url}?{urlencode(params)}")
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return [cat["*"] for cat in response.json()["parse"]["categories"]]
def create_function_fill_planet_schema():
    """Build the ``fill_planet_schema`` tool definition.

    Returns:
        A dict with ``type`` set to ``"function"`` and a ``function`` entry
        carrying the tool name, its description and a JSON-schema style
        ``parameters`` object in which only ``name`` is required.
    """

    # Factories instead of shared constants: every property receives its own
    # dict object, exactly as when each one is written out literally.
    def text():
        return {"type": "string"}

    def text_list():
        return {"type": "array", "items": {"type": "string"}}

    properties = {
        "name": text(),
        "region": text(),
        "sector": text(),
        "system": text(),
        "stars": text_list(),
        "position": text(),
        "moons": text_list(),
        "coord": {
            "$comment": "The coordinates of the planet (Galactic Standard)",
            **text(),
        },
        "xyz": text(),
        "routes": text_list(),
        "distance": text(),
        "lengthday": text(),
        "lengthyear": text(),
        "class": text(),
        "diameter": text(),
        "atmosphere": {
            "$comment": "The atmosphere of the planet",
            **text_list(),
        },
        "climate": text_list(),
        "gravity": text(),
        "terrain": text_list(),
        "water": text(),
        "interest": text(),
        "flora": text_list(),
        "fauna": text_list(),
        "otherlife": text_list(),
        "species": text_list(),
        "otherspecies": text_list(),
        "socialgroup": text(),
        "languages": text_list(),
        "government": text(),
        "population": {"type": "number"},
        "demonym": text(),
        "cities": text_list(),
        "imports": text_list(),
        "exports": text_list(),
        "affiliations": text_list(),
        "isCanon": {"type": "boolean"},
    }

    parameters = {
        "type": "object",
        "properties": properties,
        "required": ["name"],
    }

    function_spec = {
        "name": "fill_planet_schema",
        "description": "Fill in the planet schema based on the provided text from the user",
        "parameters": parameters,
    }

    return {"type": "function", "function": function_spec}
def call_openai_function(content, function_definition):
    """Call the OpenAI API to fill in the planet schema based on the provided text.

    Args:
        content: JSON-serialisable data sent to the model as the user message.
        function_definition: Tool definition dict whose ``function.name``
            identifies the tool, e.g. the result of
            ``create_function_fill_planet_schema()``.

    Returns:
        The arguments of the model's first tool call, decoded from JSON.

    Raises:
        ValueError: If the response carries no tool call.
        json.JSONDecodeError: If the tool-call arguments are not valid JSON.
    """
    function_name = function_definition["function"]["name"]
    response = openai.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "\nYou are an assistant that fills in the Planet Schema\n",
            },
            {
                "role": "user",
                "content": json.dumps(content),
            },
        ],
        model="gpt-4o",
        tools=[function_definition],
        # Force the tool call: with "auto" the model may reply in plain text,
        # leaving tool_calls as None and nothing to decode.
        tool_choice={"type": "function", "function": {"name": function_name}},
    )
    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        raise ValueError(f"Model response contained no call to {function_name!r}")
    return json.loads(tool_calls[0].function.arguments)
class Output:
    """Collected data for one wiki page: title, section texts, categories, infoboxes."""

    title: str
    sections: list[str]
    categories: list[str]
    infoboxes: list[dict]

    def __init__(self, title: str = ""):
        """Create an empty record.

        Args:
            title: Page title; may also be assigned after construction.
        """
        self.title = title
        # The lists are created per instance. As class-level defaults they
        # would be shared by every Output, so appending to one record would
        # leak entries into all the others.
        self.sections = []
        self.categories = []
        self.infoboxes = []

    def to_dict(self):
        """Return the record as a plain dict suitable for ``json.dumps``."""
        return {
            "title": self.title,
            "sections": self.sections,
            "categories": self.categories,
            "infoboxes": self.infoboxes,
        }
def text_cleanup(html: str) -> str:
    """Convert a section's HTML into plain text.

    Footnote markers (``<sup>``), elements with class ``stub`` and heading
    edit-link wrappers (class ``mw-editsection``) are removed before the text
    is extracted. Without the last step the edit link's brackets end up in
    the output, as in ``"Overview [ ] ..."``.

    Args:
        html: HTML fragment of a page section.

    Returns:
        The remaining text, each fragment stripped and joined by one space.
    """
    soup = BeautifulSoup(html, "html.parser")
    for footnote in soup.find_all("sup"):
        footnote.decompose()
    # Query again for each class so no already-removed element is touched.
    for css_class in ("stub", "mw-editsection"):
        for element in soup.find_all(attrs={"class": css_class}):
            element.decompose()
    return soup.get_text(strip=True, separator=" ")
def _infobox_soup(html):
    """Parse an infobox HTML fragment and drop its ``<sup>`` footnote markers."""
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all("sup"):
        tag.decompose()
    return soup


def _infobox_text(soup):
    """Return the soup's text with non-breaking spaces turned into plain spaces."""
    return soup.get_text(strip=True, separator=" ").replace("\u00a0", " ")


def parse_infobox(json_string):
    """Flatten the JSON of an ``infoboxes`` page property into a plain dict.

    Only the first infobox in the JSON list is read. Items are handled by
    their ``type``:

    * ``image`` - the first image's ``url`` is stored under ``"image"``.
    * ``title`` - the value is stored under ``"title"``.
    * ``group`` - the value of the group's first item names the group; every
      ``data`` item in it becomes ``{label: {"value": text, "links": [...]}}``
      where ``links`` lists the ``href`` and text of each anchor in the value.

    Args:
        json_string: JSON text of the ``infoboxes`` property.

    Returns:
        The flattened dict, or an empty dict if the JSON list is empty.
    """
    infobox_data = json.loads(json_string)
    parsed_data = {}
    if not infobox_data:
        # Nothing to read; indexing [0] below would raise IndexError.
        return parsed_data

    for item in infobox_data[0]['data']:
        item_type = item['type']
        if item_type == 'image':
            if item['data']:
                parsed_data['image'] = item['data'][0]['url']
        elif item_type == 'title':
            parsed_data['title'] = item['data']['value']
        elif item_type == 'group':
            group_items = item['data']['value']
            if not group_items:
                continue
            group_name = group_items[0]['data']['value']
            group = parsed_data[group_name] = {}
            for group_item in group_items:
                if group_item['type'] != 'data':
                    continue
                label_soup = _infobox_soup(group_item['data']['label'])
                value_soup = _infobox_soup(group_item['data']['value'])
                links = [
                    {"href": link['href'], "text": link.get_text()}
                    for link in value_soup.find_all("a", href=True)
                ]
                group[_infobox_text(label_soup)] = {
                    "value": _infobox_text(value_soup),
                    "links": links,
                }
    return parsed_data
def main():
    """Log, as JSON, the cleaned content of every page in the ``Planets`` category.

    For each member returned by ``get_category_members`` (up to 500) this
    gathers the page's categories, the plain text of each kept section and
    its parsed infoboxes into an ``Output`` and logs the result at INFO level.
    """
    pages = get_category_members(name="Planets", limit=500)
    for page in pages:
        title = page['title']
        page_sections, page_infoboxes = get_page_props(title)

        o = Output()
        o.title = title
        o.categories = get_page_categories(title)
        # Assign freshly built lists so that nothing collected for one page
        # can carry over into the next page's output.
        o.sections = [
            text_cleanup(get_section_content(title, page_section['index']))
            for page_section in page_sections
        ]
        o.infoboxes = list(page_infoboxes)

        logging.info(json.dumps(o.to_dict(), indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment