Skip to content

Instantly share code, notes, and snippets.

@LEv145
Last active November 20, 2023 21:27
Show Gist options
  • Save LEv145/09d94f46ebb06e9f1cf1f0a5df736467 to your computer and use it in GitHub Desktop.
Save LEv145/09d94f46ebb06e9f1cf1f0a5df736467 to your computer and use it in GitHub Desktop.
Парсинг предметов с https://ru.wowhead.com через Selenium (Скрипт сделан мною очень давно)
import os
import random
import re

import requests
import ujson
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# --- Headless Chrome setup -------------------------------------------------
# DesiredCapabilities.CHROME is a class-level dict shared by the whole
# process; take a copy so setting the page-load strategy below does not
# silently change the defaults for any other driver created later.
caps = DesiredCapabilities.CHROME.copy()
# "eager": navigation calls return once the DOM is ready instead of waiting
# for every image and stylesheet to finish loading.
caps["pageLoadStrategy"] = "eager"

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # no visible browser window

# NOTE(review): executable_path / desired_capabilities / chrome_options are
# the Selenium 3 keyword names; they are kept as-is because the installed
# Selenium version cannot be determined from this file.
browser = webdriver.Chrome(executable_path="./chromedriver", desired_capabilities=caps, chrome_options=chrome_options)

# --- Scrape state shared with the main loop --------------------------------
temp = []      # icon URLs that have already been handled (duplicate filter)
counter = 0    # number of images saved so far; also used to name the files
data = []      # one {'name', 'path', 'rare'} record per saved item
def save_image(url, name, timeout=30):
    """Download the image at *url* and store it as ``items/<name>.jpg``.

    Args:
        url: Address of the image to fetch.
        name: File name without extension; the file is written to
            ``items/<name>.jpg`` relative to the working directory.
        timeout: Seconds to wait for the server before giving up, so one
            stalled connection cannot hang the whole run.

    Returns:
        The relative path of the written file, e.g. ``"items/item_0.jpg"``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (so an error page is never saved as a .jpg).
        OSError: If the target directory or file cannot be written.
    """
    path = f"items/{name}.jpg"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Create the output directory on first use instead of failing in open().
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        f.write(response.content)
    return path
# Exclusive upper bounds of the 1-100 roll for each non-default rarity tier,
# checked in order; anything at or above the last bound is 'normal'.
_RARITY_TIERS = (
    (2, 'mystery'),   # roll 1
    (5, 'legend'),    # rolls 2-4
    (15, 'epic'),     # rolls 5-14
    (45, 'rare'),     # rolls 15-44
)

# Compiled once instead of on every table row.
_ICON_URL_RE = re.compile(r'https://.+\.jpg')
_NAME_CLASS_RE = re.compile(r'.+ listview-cleartext')


def _pick_rarity(roll):
    """Map a roll in 1..100 to a rarity label.

    The tiers are contiguous: the boundary rolls 5 and 15 belong to 'epic'
    and 'rare' respectively instead of falling through to 'normal'.
    """
    for upper_bound, label in _RARITY_TIERS:
        if roll < upper_bound:
            return label
    return 'normal'


# Set view of the already-handled icon URLs so the duplicate check is O(1);
# the ``temp`` list is still appended to, so it keeps its original content.
seen_icon_urls = set(temp)

try:
    # Walk every side/class filter combination and every 50-row result page.
    for c1 in range(-2, 4):
        for c2 in range(13):
            for i in range(0, 1651, 50):
                page_url = f"https://ru.wowhead.com//items/side:{c1}/class:{c2}#items;{i}"
                print(page_url)
                browser.get(page_url)
                soup = BeautifulSoup(browser.page_source, 'lxml')

                for item in soup.find_all('tr', 'listview-row'):
                    icon = item.find('div', 'iconmedium')
                    name_link = item.find('a', class_=_NAME_CLASS_RE)
                    # Skip rows that lack the expected icon or name markup
                    # rather than aborting the whole run.
                    if icon is None or icon.ins is None or name_link is None:
                        continue
                    icon_match = _ICON_URL_RE.search(icon.ins.get('style', ''))
                    if icon_match is None:
                        continue
                    icon_url = icon_match.group(0)
                    name = name_link.text

                    if icon_url in seen_icon_urls:
                        continue

                    rare = _pick_rarity(random.randint(1, 100))
                    try:
                        path = save_image(icon_url, f"item_{counter}")
                    except requests.RequestException as exc:
                        # One unreachable icon should not end the scrape; the
                        # URL is not marked as seen so a later row may retry.
                        print(f"skipping {icon_url}: {exc}")
                        continue

                    seen_icon_urls.add(icon_url)
                    temp.append(icon_url)
                    counter += 1
                    print(counter, name, path, rare)
                    data.append({'name': name, 'path': path, 'rare': rare})

                # Continue in a fresh tab for the next page. Presumably needed
                # because consecutive pages differ only in the URL fragment,
                # which would not trigger a reload in the same tab.
                # NOTE(review): the original indentation was lost; this is
                # assumed to run once per listing page - confirm.
                browser.execute_script("window.open('','_blank');")
                browser.close()
                browser.switch_to.window(browser.window_handles[-1])
finally:
    # Always shut down Chrome and chromedriver, even if the scrape fails.
    browser.quit()

print(len(data))
# Explicit UTF-8: ensure_ascii=False writes the Cyrillic item names verbatim,
# which the platform default encoding may not be able to represent.
with open('items.json', 'w', encoding='utf-8') as f:
    ujson.dump(data, f, indent=4, ensure_ascii=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment