使用PyV8解析HTML文档

1803次阅读  |  发布于5年以前

什么是PyV8?

PyV8是一个Python封装V8引擎的壳。它提供了简单可用的API,能够利用python来构建出JavaScript的运行时环境。

PyV8能用来干什么?

在nodejs火热流行的时代,或许很少人关注这个基于python简单封装的v8引擎。在某些方面,它比nodejs简洁,而它们拥有同样的本质基础,使得它具有和nodejs相似的潜力。

既然是基于v8的,那么利用它来解析dom和执行javascript是理所当然的。试想一下,如果我们能够建立一个系统把html文档结构解析成w3c dom树,那么我们就可以在上面运行任何javascript,使得这个系统事实上成为一个浏览器(尽管缺少渲染和显示的部分)。

不过,pyv8提供的是纯js运行环境,因此,要解析html文档,还需要为它构建出w3c dom以及浏览器的环境。实现dom和浏览器接口毫无疑问是件力气活,幸运的是,pyv8的官方demo中为我们做了80%以上的工作,提供了w3c.py和browser.py的支持!我们所要做的是在这基础上做一些基本的修改。

好吧,让我们来实践一下:

首先我们构建一个基本的js运行环境,参考commonjs的api实现js基础的模块加载和执行。这一部分还不涉及dom和浏览器,基本上是js原生环境。

import os, re, platform
    from PyV8 import JSContext, JSError

    from logger import logger

    class CommonJS():
        """Minimal CommonJS-style module loader built on a PyV8 ``JSContext``.

        Every ``*.js`` file found under the directories in ``_js_path`` is
        indexed by a module name of the form ``sub/dir/name`` (path relative to
        the search root, ``/``-separated, without the ``.js`` suffix).  Modules
        are evaluated lazily by ``require`` and their ``exports`` value cached.
        """

        # Search roots for JavaScript modules; shared by all instances.
        _js_path = [os.path.dirname(__file__), os.path.join(os.path.dirname(__file__),"core")]
        _js_logger = logger().instance()

        def __init__(self):
            """Create the JS context and index every module on the search path."""
            # Local import so this class does not depend on the module's
            # top-level import list.
            import threading

            # Re-entrant so that a Python callable run through ``execute`` may
            # itself call ``execute`` on the same thread without deadlocking.
            self._js_threadlock = threading.RLock()
            self._js_ctx = JSContext(self)
            self._js_modules = {}
            self._loaded_modules = {}

            for jsroot in CommonJS._js_path:
                for (root, dirs, files) in os.walk(jsroot):
                    relpath = os.path.relpath(root, jsroot)
                    if relpath == os.curdir:
                        namespace = ""
                    else:
                        # Module names always use "/" whatever the OS separator is.
                        namespace = relpath.replace(os.sep, "/") + "/"
                    for _file in files:
                        if _file.endswith(".js"):
                            name = namespace + _file[:-len(".js")]
                            self._js_modules[name] = os.path.join(root, _file)

            # ``require`` reads the global ``exports`` back after evaluating a module.
            self.execute("var exports;")

        @classmethod
        def append(cls, path):
            """Add ``path`` to the shared module search path (no duplicates).

            Only instances created afterwards index the new directory, because
            the scan happens in ``__init__``.
            """
            if path not in cls._js_path:
                cls._js_path.append(path)

        def require(self, module):
            """Evaluate ``module`` once and return its cached ``exports`` value.

            Raises ``Exception`` when the module name is unknown or when the
            script raises a ``JSError`` while being evaluated.
            """
            if module not in self._js_modules:
                raise Exception("unknown module `" + module + "`")
            path = self._js_modules[module]

            if path not in self._loaded_modules:
                self._js_logger.info("loading module <%s>...", module)
                with open(path) as source:
                    code = source.read()
                try:
                    code = code.decode("utf-8")
                    if platform.system() == "Windows":
                        code = code.encode("utf-8")
                    self._js_ctx.eval(code)
                except JSError as ex:
                    self._js_logger.error(ex)
                    self._js_logger.debug(ex.stackTrace)
                    raise Exception(ex)
                self._loaded_modules[path] = self._js_ctx.locals.exports
            return self._loaded_modules[path]

        def execute(self, code, args = ()):
            """Run ``code`` inside the JS context and return its result.

            ``code`` is either JavaScript source (byte strings are decoded as
            UTF-8) or a Python callable, which is invoked with ``args``.
            A ``JSError`` is logged and re-raised as a plain ``Exception``.
            """
            # Serialise execution so timer threads cannot interleave with it.
            with self._js_threadlock:
                self._js_ctx.enter()
                try:
                    if isinstance(code, basestring):
                        # Decoding a unicode object would first encode it as
                        # ASCII and fail on non-ASCII source, so only decode bytes.
                        if isinstance(code, str):
                            code = code.decode("utf-8")
                        if platform.system() == "Windows":
                            code = code.encode("utf-8")
                        return self._js_ctx.eval(code)
                    return code(*args)
                except JSError as ex:
                    self._js_logger.error(ex)
                    self._js_logger.debug(ex.stackTrace)
                    raise Exception(ex)
                finally:
                    self._js_ctx.leave()

好了,我们可以测试一下:

from commonjs import CommonJS
    # Register extra search paths before creating the context: the module
    # index is built once, inside CommonJS.__init__.
    CommonJS.append("/my/js/path")
    ctx = CommonJS()
    # Single quotes outside so the JavaScript string literals can use double quotes.
    print(ctx.execute('require("base"); QW.provide("Test",{}); exports = QW.Test'))

结果是 [JSObject]

在这基础上,我们进一步实现整个运行时环境和一些内置方法:

#JavaScript HTML Context for PyV8

    from PyV8 import JSClass
    from logger import logger
    import re, threading, hashlib

    import urllib,urllib2

    from w3c import parseString, Document, HTMLElement
    from commonjs import CommonJS

    import browser

    from StringIO import StringIO
    import gzip

    class JSR(CommonJS, browser.HtmlWindow):
        """JavaScript runtime bound to a single HTML document.

        Combines the ``CommonJS`` module loader with ``browser.HtmlWindow`` so
        that scripts run against a parsed DOM, and backs the window timer API
        with real ``threading.Timer`` objects.
        """

        def __init__(self, url_or_dom, charset=None, headers=None, body=None, timeout=2):
            """Build the runtime from a document, an HTML string or a URL.

            url_or_dom -- a ``Document``, markup starting with ``<``, or a URL
                          (``http://`` is prepended when no scheme is given).
            charset    -- charset used to decode a fetched page when the
                          ``Content-Type`` header names none; defaults to utf-8.
            headers    -- HTTP request headers; ``User-Agent`` also selects
                          the emulated navigator.
            body       -- form fields; when non-empty they are url-encoded and
                          sent as the request data.
            timeout    -- seconds, installed as the default socket timeout.
            """
            headers = headers or {}
            body = body or {}

            # Per-instance registry: a dict on the class would be shared by
            # every runtime, letting timer ids of different pages collide.
            self._js_timer_map = {}

            urllib2.socket.setdefaulttimeout(timeout)
            jsonp = False

            if isinstance(url_or_dom, Document):
                url = "localhost:document"
                dom = url_or_dom

            elif url_or_dom.startswith("<"):
                url = "localhost:string"
                dom = parseString(url_or_dom)

            else: #url
                url = url_or_dom
                if not re.match(r"\w+\:\/\/", url):
                    url = "http://" + url

                # Pass None rather than "" when there is no body, so that an
                # empty body does not become request data.
                data = urllib.urlencode(body) if body else None
                request = urllib2.Request(url, data, headers=headers)
                # Open the request object, not the bare URL, so that the
                # headers and body are actually sent.
                response = urllib2.urlopen(request)
                try:
                    contentType = response.headers.get("Content-Type")
                    encoding = response.headers.get("Content-Encoding")
                    html = response.read()
                finally:
                    response.close()

                if contentType:
                    if re.search(r"x-javascript|json", contentType):
                        jsonp = True
                    # Take just the charset token: no quotes, no trailing parameters.
                    m = re.search(r"charset\s*=\s*[\"']?([\w.:\-]+)", contentType, re.I)
                    if m:
                        charset = m.group(1)

                if not charset:
                    charset = "utf-8" #default charset

                if encoding and encoding == "gzip":
                    html = gzip.GzipFile(fileobj=StringIO(html)).read()

                # Keep the raw (undecoded) page around.
                self.__html__ = html
                try:
                    html = unicode(html, encoding=charset, errors="ignore")
                except LookupError:
                    # Unknown charset name: fall back to the default.
                    html = unicode(html, encoding="utf-8", errors="ignore")
                dom = parseString(html)

            navigator = browser.matchNavigator(headers.get("User-Agent") or "")

            browser.HtmlWindow.__init__(self, url, dom, navigator)
            CommonJS.__init__(self)

            self.console = JSConsole(self._js_logger)

            core_modules = ("base", "array.h", "function.h", "helper.h",
                            "object.h", "string.h", "date.h", "custevent",
                            "selector", "dom_retouch")
            for module in core_modules:
                self.execute(self.require, [module])

            if jsonp:
                # Expose a script/JSON response to page scripts as window.data.
                code = "window.data=" + html.encode("utf-8")
                self.execute(code)

            self._js_logger.info("JavaScript runtime ready.")

        def _js_execTimer(self, id, callback, delay, repeat = False):
            """Schedule timer ``id`` to fire after ``delay`` milliseconds.

            The generated script runs the callback stored in ``_js_timers`` and,
            for repeating timers, re-arms itself by calling ``_js_execTimer``
            again from JavaScript.
            """
            code = "(function f(){ _js_timers[" + str(id) + "][1].code();"
            if repeat:
                code = code + "_js_execTimer(" + str(id) + ", f, " + str(delay) + ", true);"
            code = code + "})();"

            # threading.Timer takes seconds; ``delay`` is divided by 1000.
            timer = threading.Timer(delay / 1000.0, lambda: self.execute(code))
            self._js_timer_map[id] = timer
            timer.start()

        def _js_cancelTimer(self, timerId):
            """Cancel and forget a pending timer; return False if it is unknown."""
            timer = self._js_timer_map.pop(timerId, None)
            if timer is None:
                return False
            timer.cancel()
            return True

        def setTimeout(self, callback, delay):
            """Register a one-shot timer and return its id."""
            timerId = super(JSR, self).setTimeout(callback, delay)
            self._js_execTimer(timerId, callback, delay, False)
            return timerId

        def clearTimeout(self, timerId):
            """Cancel a pending one-shot timer; unknown or cleared ids are ignored."""
            if self._js_cancelTimer(timerId):
                super(JSR, self).clearTimeout(timerId)

        def setInterval(self, callback, delay):
            """Register a repeating timer and return its id."""
            timerId = super(JSR, self).setInterval(callback, delay)
            self._js_execTimer(timerId, callback, delay, True)
            return timerId

        def clearInterval(self, timerId):
            """Cancel a repeating timer; unknown or cleared ids are ignored."""
            if self._js_cancelTimer(timerId):
                # NOTE(review): this delegates to clearTimeout, not clearInterval;
                # looks like a copy-paste slip - confirm against HtmlWindow.
                super(JSR, self).clearTimeout(timerId)

        def md5(self, str):
            """Return the hexadecimal MD5 digest of ``str``."""
            if isinstance(str, unicode):
                # hashlib needs bytes; non-ASCII unicode would fail to convert.
                str = str.encode("utf-8")
            return hashlib.md5(str).hexdigest()

    class JSConsole(JSClass):
        """``console`` object exposed to scripts; forwards messages to a logger."""

        def __init__(self, logger):
            """Remember the logger that ``log`` writes to."""
            self._js_logger = logger

        def log(self, msg):
            """Log ``msg`` at INFO level as text.

            Unicode messages are passed through unchanged; anything else is
            converted with ``str`` and decoded as UTF-8.  Undecodable bytes are
            replaced so that logging never aborts the running script.
            """
            if not isinstance(msg, unicode):
                msg = str(msg).decode("utf-8", "replace")
            self._js_logger.info(msg)

到此为止,一个简单的运行时环境就实现了。

使用方法:

from jsruntime import JSR

    rt = JSR("www.baidu.com")
    rt.execute("alert(document.title)") # result: "百度一下,你就知道"
    # Single quotes outside so the CSS selector can be a double-quoted JS string.
    rt.execute('document.querySelector("body div")') # first div matched under body
    ...

利用它,你就可以用js来处理抓取的页面元素,甚至通过运行页面上的js来获取动态加载的内容,只要继续扩展,可以用它来实现一个相当不错的页面内容采集服务,而采集规则,不再只能是简单的正则,而是可以用js像处理任何动态网页那样来处理抓取的页面。

有兴趣滴童鞋可以研究下,pyv8的潜力还是很大的,大家一起期待一下~

Copyright© 2013-2020

All Rights Reserved 京ICP备2023019179号-8