Skip to content

Instantly share code, notes, and snippets.

@j40903272
Created March 6, 2024 16:27
Show Gist options
  • Save j40903272/25af37c1869975584aa8cf65db704081 to your computer and use it in GitHub Desktop.
Save j40903272/25af37c1869975584aa8cf65db704081 to your computer and use it in GitHub Desktop.
Wikipedia entity search — helpers for resolving entities via the Google Knowledge Graph API, Google Custom Search (Wikipedia-restricted), and the Wikipedia search API, plus a breadth-first link expansion over Wikipedia pages.
from collections import Counter
from typing import List

import requests
import numpy as np
#from googlesearch import search as google_search
import wikipedia as wikipedia_api
DEFAULT_IGNORED_NS = ('wikipedia:', 'file:', 'portal:', 'template:', 'mediawiki:', 'user:',
'help:', 'book:', 'draft:', 'module:', 'timedtext:')
def _normalize_title(title):
output = ""
if len(title) > 0:
output += title[0].upper()
if len(title) > 1:
output += title[1:].replace('_', ' ')
return output
# https://developers.google.com/knowledge-graph
def google_entity_search(query, **kwargs):
key = "AIzaSyCLpGT1fyIPqVgkbSxN8X75Pd5bDduxHmA"
service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
params = {
'query': query,
'limit': 1,
'indent': True,
'key': key,
}
params.update(**kwargs)
response = requests.get(service_url, params=params).json()
for element in response['itemListElement']:
print(element['result']['name'])
print(element['result']['description'])
print(element['result']['detailedDescription'])
print(element['result']['@type'])
print(element['resultScore'])
def google_search(query, **kwargs):
key = "AIzaSyCLpGT1fyIPqVgkbSxN8X75Pd5bDduxHmA"
lang = "lang_en"
params = {
'q':query,
"key":key,
'lr':lang
}
params.update(**kwargs)
# restricted to wikipedia only
cx = "009719884197649911529:2nyd4xipros"
service_url = "https://www.googleapis.com/customsearch/v1/siterestrict"
params['cx'] = cx
response = requests.get(service_url, params=params).json()
if 'items' in response:
return [item['link'] for item in response['items']]
# search all
cx = "009719884197649911529:f5yfirrdx7o"
service_url = "https://www.googleapis.com/customsearch/v1"
params['cx'] = cx
response = requests.get(service_url, params=params).json()
if 'items' in response:
return [item['link'] for item in response['items']]
return []
def wikipedia_search(query, **kwargs):
wiki_results = Counter()
search = wikipedia_api.search(keyword, results=3)
if search:
for candidate in search:
page = wikipedia_api.page(candidate, auto_suggest=False)
if any([page.title.lower().startswith(ns) for ns in DEFAULT_IGNORED_NS]):
continue
wiki_results[_normalize_title(page.title)] += 1
return wiki_results
def bfs(seed: List[str], breadth: int = 3, topk: int = 3):
queue = seed
entities = set(seed)
for _ in range(breadth):
next_queue = set()
for keyword in queue:
search_result = wikipedia_api.search(keyword, results=topk)
for candidate in search:
page = wikipedia_api.page(candidate, auto_suggest=False)
if any([page.title.lower().startswith(ns) for ns in DEFAULT_IGNORED_NS]):
continue
for i in page.links:
next_queue.add(i)
entities |= next_queue
queue = next_queue
return entities
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment