Last active
November 19, 2022 04:38
-
-
Save KnightChaser/f0a15345d31cb1b11049d32bcedc4c32 to your computer and use it in GitHub Desktop.
구글 검색 결과 개수, 시간, 제목과 그 링크, 처리하는데 걸린 시간을 담아서 반환하는 코드. 구글 인덱스 파싱하는 코드가 안 보여서 직접 만들어 봄.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import time | |
import re | |
import random | |
class GoogleParsingConst:
    """Constants used to parse a Google search results page.

    NOTE(review): the CSS class names below come from Google's obfuscated,
    periodically-rotated markup (observed circa late 2022) — verify they
    still match the live page before relying on them.
    """
    # CSS class of each result's <h3> title element.
    index_name_class_name = "LC20lb MBeuO DKV0Md"
    # CSS class of each result's <cite> displayed-URL element.
    index_url_class_name = "iUh30 qLRx3b tjvcx"
    # First pass: strips HTML tags and properly-terminated character
    # entities (e.g. "&amp;", "&#39;"). Raw strings avoid invalid-escape
    # warnings that the original non-raw patterns produced.
    html_tag_del_regex = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    # Second pass: strips unterminated entity fragments (no trailing ';')
    # left behind by the first pass.
    html_entity_del_regex = re.compile(r'&(?:[a-z\d]+|#\d+|#x[a-f\d]+)')
    # Pool of realistic desktop user-agent strings; one is chosen at
    # random per request to look less bot-like.
    user_agent_string_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Mozilla/5.0 (Windows NT 10.0; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.35'
    ]
def get_google_search_index(keyword):
    """Scrape the first page of Google results for *keyword*.

    Returns a dict with:
      - 'search_keyword':  the query string
      - 'search_result':   total result count reported by Google
      - 'search_time':     seconds Google reports it spent searching
      - 'processing_time': local wall-clock seconds this call took
      - 'index':           {0..4: {'index_name': title, 'index_url': url}}
    Returns False when keyword is None, or an error string when the
    request or page parsing fails.

    NOTE(review): count/time parsing assumes the Korean-locale stats line
    (units '개' and '초') — confirm against the locale actually served.
    """
    if keyword is None:
        return False
    start_time = time.time()
    try:
        url = f"https://www.google.com/search?q={keyword}"
        # Rotate user agents so repeated requests look less bot-like.
        headers = {"user-agent": random.choice(GoogleParsingConst.user_agent_string_list)}
        # Timeout prevents the call from hanging forever on a stalled connection.
        res = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
    except Exception as e:
        return f"exception occurred : {e}"
    # Guard against markup changes / bot-blocking pages where the stats
    # element is absent (the original crashed with AttributeError here).
    stats_node = soup.select_one("#result-stats")
    if stats_node is None:
        return "exception occurred : could not locate #result-stats on the page"
    # Stats line looks like "검색결과 약 1,234,567개 (0.45초)".
    tokens = stats_node.text.split(' ')
    result_count = int(tokens[2].replace('개', '').replace(',', ''))   # how many results?
    result_time = float(tokens[3].replace('(', '').replace(')', '').replace('초', ''))  # Google's own search time
    name_nodes = soup.find_all("h3", {"class": GoogleParsingConst.index_name_class_name})
    url_nodes = soup.find_all("cite", {"class": GoogleParsingConst.index_url_class_name})
    processing_time = time.time() - start_time
    # Assemble the result dictionary.
    result = {
        'search_keyword': keyword,
        'search_result': result_count,
        'search_time': result_time,
        'processing_time': processing_time,
        'index': {},
    }
    for seq, (name_node, url_node) in enumerate(zip(name_nodes, url_nodes)):
        if seq >= 5:  # only the TOP 5 results are kept
            break
        index_name = re.sub(GoogleParsingConst.html_tag_del_regex, '', str(name_node))
        index_name = re.sub(GoogleParsingConst.html_entity_del_regex, '', index_name)
        index_url = re.sub(GoogleParsingConst.html_tag_del_regex, '', str(url_node))
        index_url = re.sub(GoogleParsingConst.html_entity_del_regex, '', index_url)
        # Google renders URL paths with "›" (U+203A, NOT ">") separators;
        # restore them to "/" and drop the surrounding spaces.
        index_url = index_url.replace("›", '/').replace(' ', '')
        result['index'][seq] = {
            'index_name': index_name,
            'index_url': index_url
        }
    return result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment