@gleba
Last active July 5, 2019 07:56
Search within the text of the sites returned by Google for a given query
from bs4 import BeautifulSoup
from langdetect import detect
import requests
import hashlib  # only needed if the commented-out md5 line in search() is re-enabled

def page_to_text(url, deep_keys):
    # Fetch a page and split its long <div> blocks into paragraphs.
    # Russian paragraphs containing any of deep_keys go to `found`,
    # the rest go to `overall`.
    try:
        page = requests.get(url)
    except requests.RequestException as e:
        print('request failed:', url, e)
        return ['404', []]
    soup = BeautifulSoup(page.content, 'lxml')
    overall = []
    found = []
    for tag in soup.find_all("div"):
        tag_text = tag.get_text()
        if len(tag_text) > 300:
            tt_list = tag_text.split("\n")
            for text in tt_list:
                if len(text) > 200 and detect(text) == 'ru':
                    text = text.lower()
                    search_results = []
                    for key in deep_keys:
                        search_results.append(text.find(key))
                    # str.find returns -1 when a key is absent, so a max
                    # above -1 means at least one key phrase matched.
                    if max(search_results) != -1:
                        found.append(text)
                    else:
                        overall.append(text)
    return overall, found
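
# Minimal usage sketch for calling page_to_text on its own (the URL and key
# phrase below are hypothetical placeholders, not part of the script's flow):
#   overall, found = page_to_text("https://example.com/article", ["ключевая фраза"])
#   `found` holds the Russian paragraphs that contain at least one key phrase.
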
def rate_in(counts, key):
    # Tally how often `key` has been seen (counts is a plain dict).
    if key in counts:
        counts[key] += 1
    else:
        counts[key] = 1
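
# Note: collections.Counter provides the same tallying out of the box, e.g.
#   from collections import Counter
#   targets = Counter()  # then targets[domain] += 1 needs no helper
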
def search(query, deep_keys):
    # Walk up to 30 pages of Google results for `query`, pull each linked
    # page, and classify it via page_to_text. Domains whose pages matched a
    # key phrase are tallied in `targets`, the rest in `uniques`.
    start = 0
    result_tokens = []  # per-page token dumps (collected but not written to disk below)
    result = []
    pos = 0
    uniques = {}
    targets = {}
    for _ in range(30):
        if start == 0:
            page = requests.get("https://www.google.com/search?source=hp&q=" + query)
        else:
            # note the "&" before "start": the page offset is a separate query parameter
            page = requests.get("https://www.google.com/search?source=hp&q=" + query + "&start=" + str(start))
        start = start + 10
        soup = BeautifulSoup(page.content, 'lxml')
        for tag in soup.find_all("a"):
            # "BNeawe" is the result-title div in Google's basic (no-JS) HTML
            # layout; these generated class names change over time.
            a_tag = tag.find("div", class_="BNeawe")
            if a_tag:
                href = tag.attrs['href'].split("/url?q=")
                if len(href) > 1:
                    link = href[1].split("&sa=")[0]
                    # hash = hashlib.md5(link.encode()).hexdigest()
                    text, found = page_to_text(link, deep_keys)
                    is_negative = ""
                    domain = link.split("/")[2]
                    if len(found) > 0:
                        print(link)
                        is_negative = "toxic"
                        rate_in(targets, domain)
                    else:
                        print(a_tag.text)
                        rate_in(uniques, domain)
                    info = {
                        "pos": pos,
                        "negative": is_negative,
                        "link": link,
                        "title": a_tag.text
                    }
                    tokens = {
                        "all": text,
                        "toxic": found,
                        "info": info
                    }
                    result_tokens.append(tokens)
                    result.append(info)
                    pos = pos + 1
    write_csv_array(result, "result.csv")
    write_csv_dict(uniques, "uniques.csv")
    write_csv_dict(targets, "targets.csv")
def write_csv_dict(counts, filename):
    # Dump a {key: count} dict as a two-column CSV.
    out = "key, value"
    for key in counts:
        out += "\n" + str(key) + ", " + str(counts[key])
    with open(filename, "w", encoding="utf-8") as f:
        f.write(out)
def write_csv_array(array, filename):
    # Dump a list of flat dicts as CSV, using the first dict's keys as the
    # header row. Values are written verbatim, so embedded commas will break
    # the column layout (see the csv-module sketch below).
    if len(array) == 0:
        return
    print(array[0])
    head = array[0].keys()
    out = ""
    for v in head:
        out += str(v) + ", "
    for o in array:
        out += "\n"
        for x in head:
            out += str(o[x]) + ","
    with open(filename, "w", encoding="utf-8") as f:
        f.write(out)
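
# An optional alternative sketch (not called anywhere above): the standard
# csv module escapes commas and quotes inside values, which the hand-rolled
# writers here do not. The name write_csv_array_std is made up for this example.
import csv

def write_csv_array_std(array, filename):
    if not array:
        return
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(array[0].keys()))
        writer.writeheader()
        writer.writerows(array)
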
search("основной запрос", ["фильтр слов", "поиска по контенту"])