千家信息网

使用chrome-har导出浏览器HAR数据

发表于:2025-01-25 作者:千家信息网编辑
千家信息网最后更新 2025年01月25日,这里使用nodejs下的chrome-har库来导出浏览器的har数据,经验证效果不错,比较靠谱。1,创建日志配置(ultra-harlog/module/log.js)//cnpm install
千家信息网最后更新 2025年01月25日使用chrome-har导出浏览器HAR数据

这里使用 Node.js 下的 puppeteer-har 库(其内部基于 chrome-har)配合 puppeteer 来导出浏览器的 HAR 数据,经验证效果不错,比较靠谱。

1,创建日志配置(ultra-harlog/module/log.js)

// Install the dependency first: cnpm install --save log4js
const log4js = require('log4js');

/**
 * Builds the configuration object for a `dateFile` appender that writes to
 * `logs/<dirName>/log` with the `-yyyy-MM-dd.log` pattern appended.
 *
 * @param {string} dirName - Sub-directory of `logs/` that receives the files.
 * @returns {object} A log4js appender definition.
 */
function dateFileAppender(dirName) {
  return {
    type: 'dateFile',
    filename: `logs/${dirName}/log`,
    pattern: '-yyyy-MM-dd.log',
    alwaysIncludePattern: true,
    encoding: 'utf-8',
  };
}

/**
 * Builds a category that logs at every level to the console plus, optionally,
 * one dedicated file appender.
 *
 * @param {string} [fileAppender] - Name of the file appender for this category.
 * @returns {{appenders: string[], level: string}} A log4js category definition.
 */
function categoryFor(fileAppender) {
  const appenders = fileAppender ? ['console', fileAppender] : ['console'];
  return { appenders, level: 'all' };
}

const options = {
  appenders: {
    console: { type: 'console' },
    'puppeteer-record': dateFileAppender('puppeteer'),
    'puppeteer-har-record': dateFileAppender('puppeteerhar'),
    'puppeteer-harevent-record': dateFileAppender('puppeteerharevent'),
  },
  categories: {
    // Unchanged fallback for any logger name that has no category of its own.
    default: {
      appenders: [
        'console',
        'puppeteer-record',
        'puppeteer-har-record',
        'puppeteer-harevent-record',
      ],
      level: 'all',
    },
    // One category per named logger. Previously only `default` existed, so
    // every getter below resolved to `default` and each message was written
    // to the console AND all three log files.
    console: categoryFor(),
    'puppeteer-record': categoryFor('puppeteer-record'),
    'puppeteer-har-record': categoryFor('puppeteer-har-record'),
    'puppeteer-harevent-record': categoryFor('puppeteer-harevent-record'),
  },
};

log4js.configure(options);

/**
 * @returns {object} The log4js logger for the `console` category.
 */
function getConsoleLogger() {
  return log4js.getLogger('console');
}

/**
 * @returns {object} The log4js logger for the `puppeteer-record` category.
 */
function getPuppeteerRecordLogger() {
  return log4js.getLogger('puppeteer-record');
}

/**
 * @returns {object} The log4js logger for the `puppeteer-har-record` category.
 */
function getPuppeteerHarRecordLogger() {
  return log4js.getLogger('puppeteer-har-record');
}

/**
 * @returns {object} The log4js logger for the `puppeteer-harevent-record` category.
 */
function getPuppeteerHarEventRecordLogger() {
  return log4js.getLogger('puppeteer-harevent-record');
}

exports.getConsoleLogger = getConsoleLogger;
exports.getPuppeteerRecordLogger = getPuppeteerRecordLogger;
exports.getPuppeteerHarRecordLogger = getPuppeteerHarRecordLogger;
exports.getPuppeteerHarEventRecordLogger = getPuppeteerHarEventRecordLogger;

2,创建抓取的代码(ultra-harlog/module/puppeteerhar.js)

const puppeteer = require('puppeteer');
const PuppeteerHar = require('puppeteer-har');
const path = require("path");
const logger = require("./log");
const grpcclient = require("./grpcclient");

const log = logger.getPuppeteerHarRecordLogger();

/**
 * Launches a headless browser instance through puppeteer.
 *
 * @returns {Promise<object>} The puppeteer Browser returned by `puppeteer.launch`.
 */
async function launchBrowser() {
  const browser = await puppeteer.launch({
    // If Chromium was downloaded manually, set `executablePath` here; otherwise
    // puppeteer uses its bundled copy under node_modules/puppeteer/.local-chromium/.
    // Ignore HTTPS errors when visiting https pages.
    ignoreHTTPSErrors: true,
    // Headless: no visible browser window is opened.
    headless: true,
    // Browser launch switches, see
    // https://peter.sh/experiments/chromium-command-line-switches/
    // NOTE(review): '--enable-feaures' looks like a typo for '--enable-features'
    // (which would also need a value); kept as-is pending confirmation.
    args: [
      '--disk-cache-size=0',
      '--disable-cache',
      '--disable-infobars',
      '--window-size=800,600',
      '--ignore-certificate-errors',
      '--enable-feaures',
    ],
    // Do not auto-open the DevTools panel for each tab
    // (enabling it would force headless to false).
    devtools: false,
    // Launch timeout in ms; puppeteer defaults to 30000, 0 disables the timeout.
    timeout: 0,
    // For debugging, `slowMo: 250` slows down every puppeteer action.
  });
  return browser;
}

/**
 * Loads `url` in a fresh headless browser and records the page's network
 * activity as a HAR file at `path.join(dirPath, filename)`.
 *
 * Navigation errors are logged and swallowed (best effort per URL). The
 * browser is always closed, even if stopping the HAR recorder fails; an error
 * thrown by `har.stop()` still propagates to the caller after the close.
 *
 * @param {string} url - Site address; `http://` is prepended when no scheme is present.
 * @param {string} dirPath - Directory that receives the HAR file.
 * @param {string} filename - Name of the HAR file inside `dirPath`.
 * @returns {Promise<void>}
 */
async function saveHarlog(url, dirPath, filename) {
  const harFilePath = path.join(dirPath, filename);

  // Normalise the address without reassigning the caller-visible parameter.
  const hasScheme = url.startsWith('http://') || url.startsWith('https://');
  const targetUrl = hasScheme ? url : "http://" + url;

  const browser = await launchBrowser();
  let har = null;
  try {
    // Reuse the tab the browser opens on launch instead of creating a new page.
    const page = (await browser.pages())[0];

    // Fixed 1 s delay before recording starts. A plain timer is used because
    // `page.waitFor` is not available in every puppeteer version.
    await new Promise((resolve) => setTimeout(resolve, 1000));

    har = new PuppeteerHar(page);
    await har.start({ path: harFilePath });

    // Equivalent to typing the address and pressing Enter; no navigation timeout.
    await page.goto(targetUrl, { timeout: 0 });

    // `title()` returns a Promise, so it must be awaited before logging;
    // `url()` is synchronous.
    log.info(await page.mainFrame().title());
    log.info(page.mainFrame().url());

    // NOTE: an earlier revision had a disabled hook here that called
    // grpcclient.resovleHarLog({ url, file_name, file_dir, context }) with the
    // original (un-normalised) url so a Java service could parse the HAR file.
  } catch (error) {
    log.info('resovle error :' + targetUrl + ";  error message:" + error);
  } finally {
    // Nested try/finally: a failure while stopping the recorder must not
    // leave the browser process running.
    try {
      if (har) {
        await har.stop();
      }
    } finally {
      await browser.close();
    }
  }
}

exports.launchBrowser = launchBrowser;
exports.saveHarlog = saveHarlog;

3,创建启动文件(ultra-harlog/puppeteerhar-app.js)

const fs = require("fs");
const path = require("path");
const moment = require("moment");
const schedule = require('node-schedule');
const cvsresovler = require("./module/cvsresovle");
const mhar = require("./module/puppeteerhar");

/*
  Dependencies:
    cnpm install --save moment
    cnpm install --save csv
    cnpm install --save node-schedule
    cnpm install --save puppeteer
    cnpm install --save puppeteer-har
    cnpm install --save iconv-lite
    cnpm install --save chrome-har
    cnpm install --save grpc
*/

/**
 * Creates the timestamped output directory `<__dirname>/harlogs/<ftime>` if it
 * is not already a directory, and returns its path.
 *
 * @param {string} ftime - Timestamp used as the directory name.
 * @returns {string} Absolute path of the output directory.
 */
function prepareOutputDir(ftime) {
  const dirPath = path.join(__dirname, 'harlogs', ftime);
  console.log("创建目录:" + dirPath);

  const isExistingDir = fs.existsSync(dirPath) && fs.lstatSync(dirPath).isDirectory();
  if (!isExistingDir) {
    console.log("创建文件夹" + ftime);
    // `recursive` also creates the parent `harlogs` directory on first run;
    // without it mkdirSync fails when the parent is missing.
    fs.mkdirSync(dirPath, { recursive: true });
  }
  return dirPath;
}

/**
 * Derives the HAR file name for a URL by replacing every `/` and `\` with `-`
 * and appending `.har`.
 *
 * @param {string} url - Trimmed site address.
 * @returns {string} File name (without directory or prefix).
 */
function toHarFilename(url) {
  return url.replace(/[\/\\]/g, '-') + '.har';
}

/**
 * Captures HAR data for every record of `top300.csv`, strictly one URL at a
 * time. Records without a usable `SITE_LINK` are skipped, and a failure on one
 * URL is logged without stopping the remaining ones.
 *
 * @returns {Promise<void>}
 */
async function runScheduledCapture() {
  const ftime = moment().format('YYYYMMDDHHmm');
  console.log('当前调度时间为:' + ftime);
  const dirPath = prepareOutputDir(ftime);

  // Parse the list of URLs to process.
  const dataArr = cvsresovler.readUrlRecord(path.join(__dirname, 'top300.csv'));
  console.log("解析出URL共计" + dataArr.length + "条");

  // Awaiting inside for...of keeps the captures sequential (one browser at a
  // time) while still surfacing errors through the returned promise.
  for (const [index, record] of dataArr.entries()) {
    const rawUrl = record ? record['SITE_LINK'] : undefined;
    const url = typeof rawUrl === 'string' ? rawUrl.trim() : '';
    if (!url) {
      continue;
    }
    console.log((index + 1) + "-starting to resovle url :" + url);
    try {
      await mhar.saveHarlog(url, dirPath, "N" + "-" + toHarFilename(url));
    } catch (error) {
      console.log(error);
    }
  }
}

/**
 * Registers the scheduled capture job.
 */
function init() {
  console.log('初始化调度器');
  // Six-field cron (second minute hour day month weekday):
  // fires once a day at 10:14:00.
  schedule.scheduleJob('0 14 10 * * *', () => {
    runScheduledCapture().catch((error) => {
      console.log(error);
    });
  });
  console.log('应用程序启动完成');
}

// To run on the schedule instead of the one-off test below, call:
// init();

/**
 * One-off test run: captures a single URL, taken from the first command-line
 * argument or defaulting to www.baidu.com.
 *
 * @returns {Promise<void>}
 */
async function test() {
  const ftime = moment().format('YYYYMMDDHHmm');
  console.log('当前执行时间为:' + ftime);
  const dirPath = prepareOutputDir(ftime);

  // `slice` leaves process.argv untouched, and the local is not named
  // `arguments`, which is illegal as a binding name in strict mode.
  const cliArgs = process.argv.slice(2);
  const url = (cliArgs.length > 0 ? cliArgs[0] : "www.baidu.com").trim();
  if (!url) {
    return;
  }

  console.log("starting to resovle test url :" + url);
  try {
    await mhar.saveHarlog(url, dirPath, "NT" + "-" + toHarFilename(url));
  } catch (error) {
    console.log(error);
  }
}

// Run the test; report setup failures instead of leaving the promise floating.
test().catch((error) => {
  console.log(error);
  process.exitCode = 1;
});

关于GRPC部分的代码,请参考我另外一篇博文

参考地址:https://michaljanaszek.com/blog/generate-har-with-puppeteer

0