Created
November 16, 2017 18:08
-
-
Save wangxiaodong/d38fb9fb09b2d2ceb3966af5dd73a511 to your computer and use it in GitHub Desktop.
抓百度图片的
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import json
import time
import os
import hashlib
# Usage: set page_num below to the page you want to start from.
# main() crawls page_count consecutive pages and downloads each thumbnail.
# Adjust page_count to fetch more pages.
# Images are stored in a folder named "d" under the current directory.
# Files that were already downloaded are detected and skipped automatically.
# Index of the first result page to fetch (pages are 0-based).
page_num = 0
# How many consecutive pages to download.
page_count = 1
# Number of images per page (Baidu's default is 30).
rn = 30
# The search keyword ("飞机" = "airplane").
keyword = "飞机"
# Request headers captured from a real browser session. The Cookie ties the
# requests to a Baidu session and may need refreshing once it expires.
headers = {'Accept': 'text/plain, */*; q=0.01',
           "Accept-Encoding": "gzip, deflate, br",
           "Accept-Language": "en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4",
           "Connection": "keep-alive",
           "Cookie": "BDIMGISLOGIN=0; winWH=%5E6_1280x703; BDqhfp=%E9%A3%9E%E6%9C%BA%26%26NaN-1undefined%26%260%26%261; __cfduid=dd82f58321cf57b3e90b6ecd8088431b41508419891; BDUSS=RPbGdYN2ZMUnZadjM3NlVXSkRqbVhZM28ybWgwOWxzU2F4YWFtdk5VOXlQdnRaSVFBQUFBJCQAAAAAAAAAAAEAAACkjbQ6utrU88GvbzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHKx01lysdNZcH; BAIDUID=4B4C79AED13EE3DC1C819ED095B7C1B8:FG=1; PSTM=1508937405; BIDUPSID=DE6DCA159E2BDECDF1472B04F122B738; pgv_pvi=686790656; pgv_si=s7669567488; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=3; H_PS_PSSID=1424_21092_22158; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; indexPageSugList=%5B%22%E9%A3%9E%E6%9C%BA%22%2C%22work%2B%20%E5%8A%9E%E5%85%AC%E5%AE%A4%20%E6%88%90%E9%83%BD%22%2C%22%E5%A4%A9%E5%BA%9C%E6%96%B0%E8%B0%B7%20%E5%AE%A4%E5%86%85%22%2C%22%E5%A4%A9%E5%BA%9C%E6%96%B0%E8%B0%B7%2010%20%E5%8A%9E%E5%85%AC%E5%AE%A4%22%2C%22%E5%A4%A9%E5%BA%9C%E6%96%B0%E8%B0%B7%2010%22%2C%22%E5%A4%A9%E5%BA%9C%E6%96%B0%E8%B0%B7%22%2C%22%E7%A6%8F%22%5D; cleanHistoryStatus=0; userFrom=www.baidu.com",
           "DNT": "1",
           "Host": "image.baidu.com",
           "Referer": "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1510847661183_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E9%A3%9E%E6%9C%BA",
           "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
           "X-Requested-With": "XMLHttpRequest"
           }
# Baidu's image-search JSON endpoint (same one the web UI calls via XHR).
url = "https://image.baidu.com/search/acjson"
# Query parameters mirrored from the browser's XHR request.
# 'pn' (result offset) is overwritten per page by main(); 'rn' is page size.
payload = {"tn": "resultjson_com",
           "ipn": "rj",
           "ct": 201326592,
           "is": "",
           "fp": "result",
           "queryWord": keyword,
           "cl": 2,
           "lm": -1,
           "ie": "utf-8",
           "oe": "utf-8",
           "adpicid": "",
           "st": -1,
           "z": "",
           "ic": 0,
           "word": keyword,
           "s": "",
           "se": "",
           "tab": "",
           "width": "",
           "height": "",
           "face": 0,
           "istype": 2,
           "qc": "",
           "nc": 1,
           "fr": "",
           "pn": 0,
           "rn": rn,
           "gsm": "1e",
           # Millisecond timestamp used as a cache-buster key (empty value);
           # NOTE: evaluated once at import time, not per request.
           str(int(time.time() * 1000)): ""}
def download_img(url, title, type_name):
    """Download one image into the local "d" directory.

    url       -- direct image URL (the thumbnail URL from the API)
    title     -- human-readable title, embedded in the file name
    type_name -- file extension, e.g. "jpg"

    The file name is md5(url) + title + extension, so identical URLs are
    never downloaded twice; existing files are skipped.
    """
    dir_name = "d"
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    # Strip path separators from the title so it cannot escape dir_name
    # or produce an invalid path.
    safe_title = title.replace("/", "_").replace("\\", "_")
    fn = hashlib.md5(url).hexdigest() + "_" + safe_title + "." + type_name
    fpath = os.path.join(dir_name, fn)
    if os.path.isfile(fpath):
        print('already exists: %s' % fpath)
        return
    # Stream with a timeout so a stalled server cannot hang the crawler,
    # and skip non-2xx responses instead of saving an error page as an image.
    r = requests.get(url, stream=True, timeout=30)
    try:
        if 200 <= r.status_code < 300:
            with open(fpath, 'wb') as fd:
                for chunk in r.iter_content(100 * 1024):
                    fd.write(chunk)
        else:
            print('download failed (%d): %s' % (r.status_code, url))
    finally:
        # Always release the connection back to the pool.
        r.close()
def crawl_baidu():
    """Fetch one page of Baidu image-search results and download each thumbnail.

    Reads the module-level ``url``, ``payload`` and ``headers``; the result
    page is selected by ``payload['pn']`` (set by main()).  Prints the
    metadata of every hit and saves its thumbnail via download_img().
    """
    r = requests.get(url, params=payload, headers=headers, timeout=30)
    # Bail out on any non-2xx answer instead of trying to parse an error body.
    if not (200 <= r.status_code < 300):
        print('request failed: %d' % r.status_code)
        return
    r.encoding = 'utf8'
    obj = json.loads(r.text)
    # "data" can be absent from a malformed answer; .get keeps the loop safe.
    for item in obj.get("data", []):
        print("原始图片名字: %s" % item.get("fromPageTitle", "无"))
        print("图片名字: %s" % item.get("fromPageTitleEnc", "无"))
        print("缩略图地址: %s" % item.get("thumbURL", "无"))
        print("图片地址: %s" % item.get("middleURL", "无"))
        print("-" * 45)
        if item.get("thumbURL", ""):
            print("start download img")
            t = time.time()
            download_img(item.get("thumbURL"), item.get("fromPageTitleEnc", "无"), item.get("type", "jpg"))
            print('down load: %s  spent: %s s' % (item.get("thumbURL"), time.time() - t))
        print("-" * 45)
def main():
    """Crawl page_count consecutive result pages starting at page_num,
    downloading every thumbnail on each page, with a 1s pause between pages."""
    for i in range(page_count):
        # BUG FIX: the original line ended with a stray comma, which made
        # payload['pn'] a 1-tuple ((page_num + i) * rn,) and sent a malformed
        # 'pn' query value; pn must be a plain integer result offset.
        payload['pn'] = (page_num + i) * rn
        print('下载第 %d 页' % (i + 1))
        crawl_baidu()
        print('下载完成')
        print('*' * 80)
        # Be polite to the server between pages.
        time.sleep(1)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment