Skip to content

Instantly share code, notes, and snippets.

@larry1001
Created March 7, 2019 02:08
Show Gist options
  • Save larry1001/5a23678482b4a868981ce5c9f6cd64a5 to your computer and use it in GitHub Desktop.
Save larry1001/5a23678482b4a868981ce5c9f6cd64a5 to your computer and use it in GitHub Desktop.
puppeteer 抓取新浪微博
const puppeteer = require("puppeteer");
const qs = require('querystringify');
const userName = "XXXX";
const passWord = "XXXX";
const sleep = require("./utils").sleep;
const log = require('log4js').getLogger("weibo");
const weiboconfig = require("./config").weibo;
log.level = 'info';
const index = async function (nickname) {
// Start a visible (non-headless) browser; the window size matches the viewport set below.
const launchOptions = {
  args: ["--window-size=1360,768"],
  headless: false
};
const browser = await puppeteer.launch(launchOptions);
// Open the tab that the rest of this function uses for logging in and reading Weibo posts.
const pageWeiboUser = await browser.newPage();
const viewport = { width: 1360, height: 768 };
await pageWeiboUser.setViewport(viewport);
// Mirror the page's own console output into this process's stdout.
pageWeiboUser.on('console', (msg) => console.log(msg.text()));
// Log the URL of every outgoing request except images and stylesheets.
// The previous filter, /$[jpg|png|css]/, could never match: it asked for a
// character *after* the end-of-input anchor, so every request was logged.
pageWeiboUser.on('request', (interceptedRequest) => {
  const url = interceptedRequest.url();
  // Extension must be followed by a query string, a fragment, or the end of the URL.
  const isStaticAsset = /\.(?:jpg|png|css)(?:[?#]|$)/i.test(url);
  if (!isStaticAsset) {
    console.log('A request url was made:', url);
  }
});
// No-op listener; response inspection (e.g. logging res.headers()) can be added here.
pageWeiboUser.on('response', (res) => {
});
// Publish the Node-side `weiboconfig` object to page scripts as the function
// `wbconfig`; the callback is async, so it hands back a promise for the config.
await pageWeiboUser.exposeFunction('wbconfig', async () => weiboconfig);
/**
 * Logs in to weibo.com through the on-page login form using the module-level
 * `userName` / `passWord` values.
 * Resolves after the navigation triggered by the submit click has fired `load`.
 * @return {Promise<void>}
 */
const login = async function () {
  // Waits for the field to exist, focuses it, then types with a 10 ms delay per key.
  const typeInto = async (selector, text) => {
    await pageWeiboUser.waitForSelector(selector);
    await pageWeiboUser.focus(selector);
    await pageWeiboUser.keyboard.type(text, {
      delay: 10
    });
  };

  log.info("开始登录");
  await pageWeiboUser.goto("https://www.weibo.com", {
    timeout: 0
  });
  // Wait (with no timeout) for one further navigation to finish loading.
  // NOTE(review): presumably weibo.com redirects once after the initial load;
  // if it does not, this wait never resolves — confirm against the live site.
  await pageWeiboUser.waitForNavigation({
    waitUntil: ["load"],
    timeout: 0
  });

  // Type the user name.
  console.log("输入用户名...");
  await typeInto("#loginname", userName);

  // Type the password.
  console.log("输入密码...");
  await typeInto("input[name=password]", passWord);

  // Submit the form. The navigation wait is registered BEFORE the click is
  // issued; awaiting the click first can miss the navigation and, with
  // `timeout: 0`, hang forever.
  log.info("登录中...");
  await Promise.all([
    pageWeiboUser.waitForNavigation({
      waitUntil: ["load"],
      timeout: 0
    }),
    pageWeiboUser.click("a[action-type=btn_submit]", {
      delay: 500
    })
  ]);
  log.info("登录完成");
};
/**
 * Keeps scrolling the page down until the pager element
 * (`div[node-type=feed_list_page]`) is present in the DOM.
 * Each round scrolls by 1000px and then pauses for 2000 ms before re-checking.
 * There is no upper bound on the number of rounds.
 * @return {Promise<void>}
 */
let scrollToPageBar = async function () {
  const pagerSelector = "div[node-type=feed_list_page]";
  const findPager = () => pageWeiboUser.$(pagerSelector);
  for (let pager = await findPager(); !pager; pager = await findPager()) {
    // Runs inside the browser: push the scroll position down by `distance`.
    await pageWeiboUser.evaluate((distance) => {
      document.scrollingElement.scrollTop += distance;
    }, 1000);
    await sleep(2000);
  }
};
/**
 * Loads the requested page of `nickname`'s profile feed and injects jQuery into it.
 * Navigation is done by URL; no on-page pager control is clicked.
 * @param {number} pageNum value written into the `page` query parameter
 * @return {Promise<void>}
 */
let gotoNextPage = async function (pageNum) {
  const feedUrl = `https://weibo.com/${nickname}?is_search=0&visible=0&is_ori=1&is_tag=0&profile_ftype=1&page=${pageNum}#feedtop`;
  await pageWeiboUser.goto(feedUrl);
  // The in-page scraping code relies on `$`, so jQuery is added after every navigation.
  const jqueryTag = { url: "https://cdn.bootcss.com/jquery/3.3.1/jquery.min.js" };
  await pageWeiboUser.addScriptTag(jqueryTag);
};
/**
 * Reads the total number of feed pages from the pager link's `action-data`
 * attribute (a query string carrying a `countPage` field).
 * jQuery (`$`) must already be injected into the page.
 * @return {Promise<number>} the page count; 1 when no usable `countPage` is found,
 *   since the page currently displayed is itself a page to scrape
 */
let getTotalPage = async function () {
  await scrollToPageBar();
  // Runs inside the browser: fetch the raw action-data string from the pager link.
  const actionData = await pageWeiboUser.evaluate(() => {
    return $("div[node-type=feed_list_page] div > span > a").attr("action-data");
  });
  // attr() yields undefined when the link is missing; parse "" in that case so
  // the parser is not handed the literal text "undefined".
  const parsed = qs.parse(actionData || "");
  // Convert to a number so the caller's `>=` comparison does not rely on
  // string coercion.
  const countPage = Number.parseInt(parsed.countPage, 10);
  if (Number.isNaN(countPage) || countPage < 1) {
    return 1;
  }
  return countPage;
};
/**
 * Scrapes every post on the currently loaded feed page and prints each record
 * (`weiboId`, `content`, `create_time`, `weibo_url`, `repost_num`) to stdout.
 * jQuery (`$`) and the exposed `window.wbconfig` function must be available in the page.
 * @param {number} pageNum index of the page being scraped (not used by the current body)
 * @return {Promise<void>}
 */
let getWeiboContent = async function (pageNum) {
  await scrollToPageBar();
  // Move the scroll position to 300px from the top of the page.
  await pageWeiboUser.evaluate(() => {
    document.scrollingElement.scrollTop = 300;
  });
  // Number of posts found on this page.
  const count = await pageWeiboUser.evaluate(() => {
    return $("div[action-type=feed_list_item]").length;
  });
  log.info("weibo count " + count);
  const wc = await pageWeiboUser.evaluate(async () => {
    // `wbconfig` is a function exposed from Node, not a plain object: it has to
    // be called and awaited. Reading `window.wbconfig.domain` directly gave
    // undefined, so every weibo_url began with the text "undefined".
    const config = await window.wbconfig();
    const weiboes = [...$("div[action-type=feed_list_item]")];
    return weiboes.map((weibo) => {
      const item = $(weibo);
      // Debug aid: dump the post's HTML to the page console.
      console.log(item.html());
      const dateLink = item.find("[node-type=feed_list_item_date]");
      return {
        weiboId: $.trim(item.attr("mid")),
        content: $.trim(item.find("div[node-type=feed_list_content]").text()),
        create_time: $.trim(dateLink.attr("title")),
        weibo_url: config.domain + dateLink.attr("href"),
        repost_num: item.find("[action-type=fl_forward] em:eq(1)").text()
      };
    });
  });
  // `const` keeps the loop variable local; the bare `we` created an implicit
  // global (and throws in strict mode).
  for (const we of wc) {
    console.log(we);
  }
  process.stdout.write(".");
  await sleep(50);
  process.stdout.write("\n");
};
// Log in, open the profile feed, then scrape it page by page.
// try/finally guarantees the browser is closed even when a step throws, so a
// failed run does not leave a browser process behind.
try {
  await login();
  await pageWeiboUser.goto("https://weibo.com/" + nickname + "?profile_ftype=1&is_ori=1");
  // waitForSelector is what waitFor() does for a selector string; waitFor()
  // itself is deprecated in later Puppeteer releases.
  await pageWeiboUser.waitForSelector("div#plc_frame");
  // The in-page scraping code relies on `$`.
  await pageWeiboUser.addScriptTag({
    url: "https://cdn.bootcss.com/jquery/3.3.1/jquery.min.js"
  });
  const countPage = await getTotalPage();
  for (let pageNum = 1; pageNum <= countPage; pageNum++) {
    // Page 1 is already loaded. Navigating only when another page remains
    // avoids the previous extra request for page countPage + 1.
    if (pageNum > 1) {
      await gotoNextPage(pageNum);
    }
    console.log("开始抓取第[" + pageNum + "]页数据...");
    await getWeiboContent(pageNum);
    console.log("第[" + pageNum + "]页数据抓取结束");
  }
  console.log("\n\n抓取结束");
} finally {
  await browser.close();
}
};
// Run the scraper for one profile. The rejection handler reports the failure
// and marks the process as failed instead of leaving an unhandled rejection.
index("bbshefeicc").catch((err) => {
  console.error("抓取失败:", err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment