puppeteer(木偶人爬虫)
最近在公司大佬的带领下了解了一下谷歌的木偶人,发现JS无所不能啊。@(乖)
直接上代码吧(把 Bing 壁纸页面上的图片抓取并保存到本地):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
| const puppeteer = require('puppeteer'); const https = require('https'); const fs = require('fs');
/**
 * Entry point: opens the Bing wallpaper gallery in a visible Chromium window,
 * hands every intercepted image request URL to `downLoadAndSave`, and scrolls
 * the page to the bottom so that further images get requested.
 *
 * On success the browser is left open, as in the original script; closing it
 * right after the last scroll step could cut off image requests that are
 * still being issued. On failure the browser is closed and the error logged.
 */
(async () => {
  let browse;
  try {
    browse = await puppeteer.launch({ headless: false, devtools: false });
  } catch (err) {
    // Nothing to clean up: the browser never started.
    console.error('failed to launch browser:', err);
    return;
  }

  try {
    const page = await browse.newPage();
    await page.setViewport({ width: 1280, height: 800 });
    await page.setRequestInterception(true);

    page.on('request', (request) => {
      if (request.resourceType() === 'image') {
        try {
          downLoadAndSave(request.url());
        } catch (err) {
          // A failed download must not prevent the request from continuing,
          // otherwise the intercepted request would hang forever.
          console.log(`download failed to start: ${err}`);
        }
      } else {
        console.log("continue");
      }
      // Every intercepted request has to be continued exactly once.
      request.continue();
    });

    await page.goto('https://bing.ioliu.cn/');
    await autoScroll(page);
  } catch (err) {
    console.error('crawl failed:', err);
    try {
      await browse.close();
    } catch (closeErr) {
      console.error('failed to close browser:', closeErr);
    }
  }
})();
/**
 * Scroll the page downwards in fixed steps until the scrolled distance
 * reaches the height of `document.body`.
 *
 * @param {object} page - Puppeteer page (any object exposing
 *   `evaluate(fn, ...args)` that runs `fn` in the page context).
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels scrolled per step.
 * @param {number} [options.interval=200] - Delay between steps, in ms.
 * @returns {Promise<void>} Resolves once scrolling has finished.
 */
async function autoScroll(page, { distance = 100, interval = 200 } = {}) {
  console.log('auto scroll start');
  // The callback is executed inside the page, so it cannot close over local
  // variables; the step size and delay are passed as evaluate() arguments.
  await page.evaluate(
    (step, delay) =>
      new Promise((resolve) => {
        let totalHeight = 0;
        const timer = setInterval(() => {
          // Re-read the height on every tick so that any growth of the
          // document while scrolling is taken into account.
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, step);
          totalHeight += step;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            resolve();
          }
        }, delay);
      }),
    distance,
    interval
  );
  console.log('auto scroll done');
}
// Monotonic counter appended to file names so that two downloads finishing
// within the same millisecond do not overwrite each other.
let savedImageCount = 0;

/**
 * Download an image over HTTPS and write it to `image/img_<time>_<n>.jpeg`.
 *
 * The `image` directory must already exist. All failures (unsupported URL,
 * network error, non-200 response, write error) are logged, never thrown,
 * so a single bad image cannot abort the crawl.
 *
 * @param {string} url - Absolute image URL; only `https:` URLs are fetched.
 * @returns {void}
 */
function downLoadAndSave(url) {
  // https.get() throws synchronously on other protocols (data:, http:, ...).
  if (typeof url !== 'string' || !/^https:\/\//i.test(url)) {
    console.log(`skip unsupported url: ${String(url).slice(0, 80)}`);
    return;
  }

  https
    .get(url, (res) => {
      if (res.statusCode !== 200) {
        // Drain the body so the socket is released, but do not save it:
        // it is not the requested image.
        res.resume();
        console.log(`download failed (${res.statusCode}): ${url}`);
        return;
      }

      // Collect raw Buffers instead of building a 'binary' string.
      const chunks = [];
      res.on('data', (chunk) => {
        chunks.push(chunk);
      });
      res.on('error', (err) => {
        console.log(`download failed: ${err}`);
      });
      res.on('end', () => {
        const time = Date.now();
        savedImageCount += 1;
        const file = `image/img_${time}_${savedImageCount}.jpeg`;
        fs.writeFile(file, Buffer.concat(chunks), (err) => {
          if (err) {
            console.log(`保存出错${err}`);
          }
        });
      });
    })
    // Without this handler a DNS/TLS/connection error would surface as an
    // unhandled 'error' event and terminate the process.
    .on('error', (err) => {
      console.log(`download failed: ${err}`);
    });
}
|
package.json
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
| { "name": "MuourenTest", "version": "1.0.0", "description": "木偶人测试", "main": "index.js", "scripts": { "test:jest": "jest" }, "author": "xiaohuwei", "license": "ISC", "devDependencies": { "chromedriver": "^75.1.0", "nightwatch": "^1.1.13" }, "dependencies": { "puppeteer": "^1.18.1" } }
|
执行流程和效果
![]()
![]()
PS:运行前需要先在脚本同级目录下新建 image 文件夹哦