基于puppeteer的网页截图应用之超长网页截图的思考与探索

您所在的位置:网站首页 网页长截图捷径 基于puppeteer的网页截图应用之超长网页截图的思考与探索

基于puppeteer的网页截图应用之超长网页截图的思考与探索

2024-06-24 21:20| 来源: 网络整理| 查看: 265

前言

某日，由于需要对某些网页进行截图存储，于是采用 Node.js + Puppeteer 的方案来实现。但在测试截图效果时发现，一些超长的网页无法得到正确的截图结果，为了解决超长网页的截图问题，前后经历了不少波折。

快速实现

```js
const puppeteer = require('puppeteer')
(async () => {
  const browser = await puppeteer.launch({
    args: [
      '--disable-gpu',
      '--disable-dev-shm-usage',
      '--disable-setuid-sandbox',
      '--no-first-run',
      '--no-zygote',
      '--no-sandbox'
    ],
    timeout: 0,
    pipe: true,
    headless: true,
    ignoreHTTPSErrors: true,
    // executablePath: ChromiumPath
  });
  const page = await browser.newPage()
  await page.goto('https://karoy.cn/')
  await page.screenshot({path: 'karoy.png'})
  await browser.close()
})()
```

（编者注：上面第一行 `require('puppeteer')` 末尾没有分号，而下一行以 `(` 开头，JavaScript 不会在此处自动插入分号，会把它解析成对 `require` 返回值的函数调用。实际使用时请在第一行末尾补上分号。）

尽量完成网页渲染

```js
// 500ms内请求数为0
await page.goto('https://karoy.cn/', {waitUntil: 'networkidle0'})

// 等待1000ms
await page.waitFor(1000)
```

截取网页全图

```js
await page.screenshot({
  path: 'karoy.png',
  type:'png',
  fullPage:true
})
```

超长网页截取全图失败

这时一旦遇到超长网页，截图就会失败，一般表现为图片后部存在大片的空白。具体原因在于 Chromium 的硬性实现：当网页高度达到 16384px 以上时，将不再处理后续内容。

方案一：多次截图后拼接

```js
//建议先通过自动滚屏方式,让网页完全加载后再取得相应数据
let {pageHeight,viewport} = await page.evaluate(() => {
  window.scrollTo(0, 0);
  return {
    pageHeight: document.body.scrollHeight,
    viewport: {
      height: document.body.clientHeight,
      width: document.body.clientWidth
    }
  };
});
let viewHeight = viewport.height
let viewWidth = viewport.width
let maxViewHeight = 16000;
let partViewCount = Math.ceil(pageHeight / maxViewHeight);
let lastViewHeight = pageHeight - ((partViewCount - 1) * maxViewHeight);
let images = []
for (let i = 1; i /* ……（编者注：此处部分原始代码在网页抓取时丢失）…… */ {
      $('body').css('margin-top', '-' + totalMarignTop + 'px')
      var timer = setTimeout(() => {
        clearTimeout(timer);
        resolve();
      }, 1000);
    })
  }, totalMarignTop)
}
// 合并图片
```

最终的超长网页全图，就是基于上面所展示的核心代码来完成生成的。

合并图片

```js
if (partViewCount == 1) {
  let img = await Jimp.read(images[0])
  img.write(filename)
} else {
  let img = await mergeImg(images, {
    direction: true
  })
  img.write(filename)
}
```

完整方法

```js
// utils.js 部分
const Jimp = require('jimp')
const mergeImg = require('merge-img')

var pageScreenshot = async function(page, filename) {
  let {
    pageHeight,
    viewport
  } = await page.evaluate(() => {
    window.scrollTo(0, 0);
    return {
      pageHeight: document.body.scrollHeight,
      viewport: {
        height: document.body.clientHeight,
        width: document.body.clientWidth
      }
    };
  });
  let viewHeight = viewport.height
  let viewWidth = viewport.width
  let maxViewHeight = viewHeight;
  let partViewCount = Math.ceil(pageHeight / maxViewHeight);
  let lastViewHeight = pageHeight - ((partViewCount - 1) * maxViewHeight);
  let totalMarignTop = 0
  let images = []
  for (let i = 1; i /* ……（编者注：此处部分原始代码在网页抓取时丢失）…… */ {
      return new Promise((resolve, reject) => {
        $('body').css('margin-top', '-' + totalMarignTop + 'px')
        var timer = setTimeout(() => {
          clearTimeout(timer);
          resolve();
        }, 1000);
      })
    }, totalMarignTop)
  }
  return new Promise(async (resolve, reject) => {
    if (partViewCount == 1) {
      let img = await Jimp.read(images[0])
      img.write(filename)
    } else {
      let img = await mergeImg(images, {
        direction: true
      })
      img.write(filename)
    }
    resolve()
  })
}

var autoScroll = function(page) {
  return page.evaluate(() => {
    return new Promise((resolve, reject) => {
      var totalHeight = 0;
      var distance = 100;
      var timer = setInterval(() => {
        var scrollHeight = document.body.scrollHeight;
        window.scrollBy(0, distance);
        totalHeight += distance;
        if (totalHeight >= scrollHeight) {
          clearInterval(timer);
          resolve(totalHeight);
        }
      }, 100);
    })
  });
}

module.exports = {
  pageScreenshot,
  autoScroll
}
```

```js
// app.js 部分
const puppeteer = require('puppeteer')
const { pageScreenshot, autoScroll } = require('./utils')
//
// 使用demo
(async () => {
//...
//...
let browser = await puppeteer.launch({
  args: [
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--disable-setuid-sandbox',
    '--no-first-run',
    '--no-zygote',
    '--no-sandbox'
  ],
  timeout: 0,
  pipe: true,
  headless: true,
  ignoreHTTPSErrors: true,
  // executablePath: ChromiumPath,
  defaultViewport: null
})
let page = await browser.newPage()
await page.setViewport({
  width: 1000,
  height: 1920,
  deviceScaleFactor: 1
})
await page.goto('https://karoy.cn/', {
  timeout: 30000,
  waitUntil: ['networkidle0']
})
await autoScroll(page)
await page.evaluate(() => {
  window.scrollTo(0, 0)
})
await page.waitFor(500)
await pageScreenshot(page, 'title.png').catch(err => console.log(err))
//...
//...
})()
```

结束

虽然最终解决问题的关键很简单，但在遇到问题的时候，尝试过很多方式，包括重新安装依赖、多次改变参数选项、搜索相关资料等。最后另辟蹊径，才解决了本次遇到的问题。



【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3