目录
  1. 1. puppeteer(木偶人爬虫)
    1. 1.0.1. 直接上菜吧(获取bing页面的壁纸到本地)
      1. 1.0.1.1. package.json
    2. 1.0.2. 执行流程和效果
优雅的使用木偶人实现图片爬虫

puppeteer(木偶人爬虫)

最近在公司大佬的带领下了解了一下谷歌的 Puppeteer(直译为“木偶人”,一个用来控制 Chrome/Chromium 的 Node.js 库),发现 JS 真是无所不能啊。

直接上菜吧(获取bing页面的壁纸到本地)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
const puppeteer = require('puppeteer');
const https = require('https');
const fs = require('fs');
// Entry point: launch a visible Chromium window, open the Bing wallpaper
// gallery, hand every image request URL to downLoadAndSave(), then scroll to
// the bottom of the page so lazily loaded images are requested as well.
(async () => {
  // Declared outside the try block so the finally clause can close the
  // browser no matter which step failed.
  let browse;
  try {
    browse = await puppeteer.launch({
      headless: false,
      devtools: false
    });
    const page = await browse.newPage();
    await page.setViewport({ width: 1280, height: 800 });
    await page.setRequestInterception(true);
    page.on('request', request => {
      // Watch every request of type "image" and grab its address.
      if (request.resourceType() === 'image') {
        try {
          downLoadAndSave(request.url());
        } catch (err) {
          // A failed download must not prevent continue() below; otherwise
          // the intercepted request would stay pending forever.
          console.log('download could not be started: ' + err);
        }
      } else {
        console.log("continue");
      }
      // With interception enabled every request has to be resumed explicitly.
      request.continue();
    });
    await page.goto('https://bing.ioliu.cn/');
    await autoScroll(page);
  } finally {
    // Release Chromium even on failure. `browse` is still undefined when
    // launch() itself rejected, so guard the call.
    if (browse) {
      await browse.close();
    }
  }
})().catch(err => {
  // Surface the real failure instead of leaving an unhandled rejection.
  console.error('crawler failed:', err);
  process.exitCode = 1;
});
/**
 * Scroll the page down step by step, the way a human reader would, until the
 * accumulated scroll distance reaches the document height.
 *
 * @param {object} page - Object exposing `evaluate(fn)`, which runs `fn` in
 *   the page context (a Puppeteer page in this script).
 * @returns {Promise<void>} Settles once the in-page scrolling has finished.
 */
async function autoScroll(page) {
  console.log('auto scroll start');

  // This function is handed to page.evaluate(), so it must be fully
  // self-contained: it may only touch in-page globals (window, document).
  const scrollStepByStep = () =>
    new Promise(done => {
      const step = 100;
      let scrolled = 0;
      const ticker = setInterval(() => {
        // Read the height before scrolling; it is re-read on every tick
        // because it may grow as more content is loaded.
        const pageHeight = document.body.scrollHeight;
        window.scrollBy(0, step);
        scrolled += step;
        if (scrolled < pageHeight) {
          return;
        }
        clearInterval(ticker);
        done();
      }, 200);
    });

  await page.evaluate(scrollStepByStep);
  console.log('auto scroll done');
}
/**
 * Download one image over HTTPS and save it into the `image/` directory
 * (resolved against the current working directory; it must already exist).
 *
 * Fire-and-forget: the function returns immediately and reports every failure
 * through console.log instead of throwing.
 *
 * @param {string} url - Absolute image URL. Only `https://` URLs are fetched;
 *   anything else (http:, data:, empty, non-string) is skipped, because
 *   `https.get` throws synchronously for other protocols.
 * @returns {void}
 */
function downLoadAndSave(url) {
  if (typeof url !== 'string' || !/^https:\/\//i.test(url)) {
    console.log('skip non-https image: ' + String(url).slice(0, 80));
    return;
  }

  let request;
  try {
    request = https.get(url, response => {
      if (response.statusCode !== 200) {
        // Drain the body so the socket is released, but do not store an
        // error page or redirect stub as if it were an image.
        response.resume();
        console.log('skip image, HTTP status ' + response.statusCode + ': ' + url);
        return;
      }

      // Collect raw Buffers; no string encoding round-trip is needed.
      const chunks = [];
      response.on('data', chunk => {
        chunks.push(chunk);
      });
      response.on('error', err => {
        console.log('download failed: ' + err);
      });
      response.on('end', () => {
        // Several downloads can finish within the same millisecond, so a
        // per-process sequence number is appended to keep file names unique.
        downLoadAndSave.sequence = (downLoadAndSave.sequence || 0) + 1;
        const time = new Date().getTime();
        const target = `image/img_${time}_${downLoadAndSave.sequence}.jpeg`;
        fs.writeFile(target, Buffer.concat(chunks), err => {
          if (err) {
            console.log('保存出错' + err);
          }
        });
      });
    });
  } catch (err) {
    // https.get throws synchronously on URLs it cannot parse.
    console.log('download failed: ' + err);
    return;
  }

  // Without this listener a DNS/TLS/socket failure would be emitted as an
  // unhandled 'error' event and terminate the process.
  request.on('error', err => {
    console.log('download failed: ' + err);
  });
}

package.json

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
{
"name": "MuourenTest",
"version": "1.0.0",
"description": "木偶人测试",
"main": "index.js",
"scripts": {
"test:jest": "jest"
},
"author": "xiaohuwei",
"license": "ISC",
"devDependencies": {
"chromedriver": "^75.1.0",
"nightwatch": "^1.1.13"
},
"dependencies": {
"puppeteer": "^1.18.1"
}
}

执行流程和效果

1
2
npm i
node request.js

PS:运行前需要在执行命令的当前目录(即脚本同级目录)下新建 image 文件夹,否则图片写入会因目录不存在而失败,控制台会输出“保存出错”。

文章作者: 肖虎威
文章链接: https://xiaohuwei.github.io/2019/11/01/%E4%BC%98%E9%9B%85%E7%9A%84%E4%BD%BF%E7%94%A8%E6%9C%A8%E5%81%B6%E4%BA%BA%E5%AE%9E%E7%8E%B0%E5%9B%BE%E7%89%87%E7%88%AC%E8%99%AB/
版权声明: 本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 肖虎威博客

评论