chromedp网络监听_动态爬虫三：监听网络事件 + 监听js事件-白红宇的个人博客

发布日期：2021-06-24 17:13:27 浏览次数：2 分类：技术文章

本文共 7535 字，大约阅读时间需要 25 分钟。

一：概述

上两篇文章介绍了 CDP 协议和 chromedp 库，从这篇文章开始动手实战。我们要拿到页面上更多的网络请求，最直接的想法就是类似于开发者工具里的 Network 面板：只要一有网络请求就显示在列表里(在 Network 里隐藏的网络请求后续讨论)。页面加载完成后就可以浏览、操作页面了，比如点击、滑动等等，这些都是 js 的事件；如果能把所有绑定了事件的元素都找到，再逐个去触发它们，那么网络请求那边就能监听到由此产生的请求。

二：小试牛刀

这是开发者工具的网络窗口：

思路有了，接下来就是要找到一个合适的事件：打开协议监控(参考前几篇文章)，刷新页面，然后搜索 network，就会看到和网络相关的事件了(Network 这个域的名字来源于 CDP 协议文档)。

从这个字面理解网络请求即将发送，很适合啊，就它了。

顺便看一下其他事件，下面这个很适合采集响应

使用之前搭建的框架试一下，代码结构如下：

项目结构等以后单独写篇文章讨论，上面提到的逻辑都写在 scan.go 中了，总体代码如下：

func run(cmd *cobra.Command, args []string) {
       sugar := logger.Sugar()   var ws sync.WaitGroup   listRequest := list.New()   listResponse := list.New()   options := append(chromedp.DefaultExecAllocatorOptions[:],      chromedp.Flag("headless", true),      chromedp.Flag("disable-gpu", false),      chromedp.Flag("disable-extensions", true),      chromedp.Flag("hide-scrollbars", false),      chromedp.Flag("mute-audio", false),      chromedp.Flag("enable-automation", false),      chromedp.UserAgent(`Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36`),   )   defaultCtx, cancel := chromedp.NewExecAllocator(context.Background(), options...)   defer cancel()   withLogCtx, cancel := chromedp.NewContext(defaultCtx, chromedp.WithLogf(log.Printf))   defer cancel()   timeoutCtx, cancel := context.WithTimeout(withLogCtx, time.Duration(viper.GetInt64("timeout")) *time.Second)   defer cancel()   if err := chromedp.Run(timeoutCtx, make([]chromedp.Action, 0, 1)...); err != nil {
          sugar.Fatal("create chrome error", err)   }   sugar.Info("Chrome instance create success. \n")   chromedp.ListenTarget(timeoutCtx, func(event interface{}) {
          switch event.(type) {
          case *network.EventRequestWillBeSent:         ws.Add(1)         go func(req *network.EventRequestWillBeSent) {
                harReq := common.ProcessRequest(req)            rm := map[network.RequestID]*har.Request{}            rm[req.RequestID] = harReq            listRequest.PushBack(rm)            ws.Done()         }(event.(*network.EventRequestWillBeSent))      case *network.EventResponseReceived:         ws.Add(1)         go func(resp *network.EventResponseReceived) {
                harResp := common.ProcessResponse(resp)            rm := map[network.RequestID]*har.Response{}            rm[resp.RequestID] = harResp            listResponse.PushBack(rm)            ws.Done()         }(event.(*network.EventResponseReceived))      case *network.EventDataReceived:         // TODO::      }   })   if err := chromedp.Run(timeoutCtx, network.Enable()); err != nil {
          panic(err)   }   if err:= chromedp.Run(timeoutCtx, dom.Enable()); err != nil {
          panic(err)   }   if err:= chromedp.Run(timeoutCtx, chromedp.Navigate(args[0])); err != nil {
          panic(err)   }   if err := chromedp.Run(timeoutCtx, common.WalkAllNode()); err != nil {
          panic(err)   }   ws.Wait()}

创建浏览器实例部分之前解释过，这里就不多说了，我们现在创建了一个浏览器实例，得为我们的浏览器设置网络监听处理，就是下面这段

// Install a listener for the target's CDP events. Each event is handled in
// its own goroutine, tracked by the ws WaitGroup.
// NOTE(review): container/list is not safe for concurrent use, so the
// PushBack calls below need external synchronization (e.g. a sync.Mutex)
// when several events are handled at the same time.
chromedp.ListenTarget(timeoutCtx, func(event interface{}) {
	switch ev := event.(type) {
	case *network.EventRequestWillBeSent:
		// Fired just before a request is sent.
		ws.Add(1)
		go func(req *network.EventRequestWillBeSent) {
			defer ws.Done()
			harReq := common.ProcessRequest(req)
			rm := map[network.RequestID]*har.Request{req.RequestID: harReq}
			listRequest.PushBack(rm)
		}(ev)
	case *network.EventResponseReceived:
		// Fired when a response has been received.
		ws.Add(1)
		go func(resp *network.EventResponseReceived) {
			defer ws.Done()
			harResp := common.ProcessResponse(resp)
			rm := map[network.RequestID]*har.Response{resp.RequestID: harResp}
			listResponse.PushBack(rm)
		}(ev)
	case *network.EventDataReceived:
		// TODO::
	}
})

这些事件最好都在协程里处理，同时注意要开启网络监听，我弄那会儿就折腾了半天抓不到流量，仔细看文档才发现的

// Enable the Network domain on the target; the article notes that no traffic
// was captured by the listener until this call was added.
if err := chromedp.Run(timeoutCtx, network.Enable()); err != nil {
	panic(err)
}

中间插一个介绍，har 这个东西，我之前都不知道，翻源码发现的，就是专门记录http流量的一个格式

维基百科介绍： https://zh.wikipedia.org/wiki/.har

反正现在还不知道存成啥样子呢，先按这个格式来吧，上面写了2个处理函数，代码如下：

func ProcessRequest(r *network.EventRequestWillBeSent) *har.Request {
       req := har.Request{}   req.Method = r.Request.Method   req.URL = r.Request.URL   req.Headers = []*har.NameValuePair{}   for header := range r.Request.Headers {
          h := har.NameValuePair{}      h.Name = header      h.Value = r.Request.Headers[header].(string)      req.Headers = append(req.Headers, &h)   }   req.Cookies = []*har.Cookie{}   req.QueryString = []*har.NameValuePair{}   u, err := url.Parse(req.URL)   if err != nil {
          log.Printf("[E] Invalid URL data recived : %v", err)   }   for name := range u.Query() {
          if len(name) != 0 {
             values := u.Query()[name]         for _, val := range values {
                req.QueryString = append(req.QueryString, &har.NameValuePair{
                   Name:  name,               Value: val,            })         }      }   }   if req.Method == "POST" {
          // Process the post data of the form *har.PostData   }   return &req}func ProcessResponse(r *network.EventResponseReceived) *har.Response {
       res := har.Response{}   res.Status = r.Response.Status   res.StatusText = r.Response.StatusText   res.HTTPVersion = r.Response.Protocol   // TODO : implement cookie information.   res.Cookies = nil   res.Headers = []*har.NameValuePair{}   // headers from the *network.EventRequestWillBeSent are in the form,   // map[key:value]. this needs to be converted to the form of a   // har.NameValuePair   for header := range r.Response.Headers {
          h := har.NameValuePair{}      h.Name = header      h.Value = r.Response.Headers[header].(string)      res.Headers = append(res.Headers, &h)   }   // response content   res.Content = &har.Content{}   res.Content.MimeType = r.Response.MimeType   res.Content.Size = 0   // Redirect URL   res.RedirectURL = ""   res.HeadersSize = 0   res.BodySize = 0   return &res}

好了，跑一下程序试试效果

看着还可以，哈哈哈，接下来该到查看所有带事件的节点了，我们知道 html 最后会解析为一棵 DOM 树，那么怎么获取这个树呢，还是翻文档，发现个这个，如下：

把所有节点打平到一个数组里，正是我们想要的。数组里是 Node 结构，如下：

翻了一下发现好像没事件相关的东西，这个地方折腾了好一会儿，还是看文档，如下：

事件相关的在 DOMDebugger 中，这个方法可以获取所有事件监听。但是这个函数要的参数是一个 RemoteObjectID。

我擦，此时陷入了迷茫，我有的只有一个 nodeID，怎么获取 RemoteObjectId ？？？

功夫不负有心人，找到一篇文章

(链接：https://stackoverflow.com/questions/31280653/what-is-nodeid-in-chrome-extension-chrome-debugger)

搞清楚后，还是回到文档，开始找找转换函数，于是找到了这个货

胜利就在前方，上代码

func WalkAllNode() chromedp.Tasks {
       return chromedp.Tasks{
          chromedp.ActionFunc(func(ctx context.Context) error {
             node, err := dom.GetFlattenedDocument().Do(ctx)         if err != nil {
                return err         }         for _, tmpNode := range node {
                rmObject, _ := dom.ResolveNode().WithNodeID(tmpNode.NodeID).Do(ctx)            eventLister, _ := domdebugger.GetEventListeners(rmObject.ObjectID).Do(ctx)            if len(eventLister) > 0 {
                   fmt.Println(tmpNode.NodeName, tmpNode.Attributes)               for _, event := range eventLister {
                      fmt.Println(event.Type)               }            }         }         return nil      }),   }}

调用的地方在这里

...
// Navigate to the URL given as the first command-line argument.
if err := chromedp.Run(timeoutCtx, chromedp.Navigate(args[0])); err != nil {
	panic(err)
}
// Once navigation has returned, walk the DOM and print the nodes that have
// event listeners.
if err := chromedp.Run(timeoutCtx, common.WalkAllNode()); err != nil {
	panic(err)
}
// Block until the goroutines registered on the ws WaitGroup have finished.
ws.Wait()
...

跑一下，效果如下