使用python解析xml成对应的html示例分享

1265次阅读  |  发布于5年以前

使用SAX将ddt.xml解析成html。当然啦,如果得到了xml对应的xsl文件,可以直接用libxml2将其转换成html。

复制代码 代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#---------------------------------------
#   程序:XML解析器
#   版本:01.0
#   作者:mupeng
#   日期:2013-12-18
#   语言:Python 2.7
#   功能:将xml解析成对应的html
#   注解:该程序用xml.sax模块的parse函数解析XML,并生成事件
#         继承ContentHandler并重写其事件处理函数
#         Dispatcher主要用于相应标签的起始、结束事件的派发
#---------------------------------------

from xml.sax.handler import ContentHandler
from xml.sax import parse

class Dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    For an element called ``name`` the start event looks up the attribute
    ``'start' + name.capitalize()`` and the end event looks up
    ``'end' + name.capitalize()`` (so ``pubDate`` maps to ``startPubdate``
    and ``endPubdate``).  When no callable of that name exists, the
    ``defaultStart`` / ``defaultEnd`` attribute is tried instead.
    Handlers are always invoked without arguments.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Invoke the handler for ``prefix`` and element ``name``.

        prefix -- event kind used to build the handler name; the callers
                  in this class pass 'start' or 'end'.
        name   -- element name as reported by the SAX parser.
        attrs  -- accepted so startElement() can pass it through, but it
                  is not forwarded to the handler.

        Does nothing when neither the specific nor the default handler is
        callable.
        """
        method = getattr(self, prefix + name.capitalize(), None)
        if not callable(method):
            # No tag-specific handler: fall back to defaultStart/defaultEnd.
            method = getattr(self, 'default' + prefix.capitalize(), None)
        if callable(method):
            method()

    def startElement(self, name, attrs):
        """SAX callback for an opening tag; delegates to dispatch()."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback for a closing tag; delegates to dispatch()."""
        self.dispatch('end', name)

class Website(Dispatcher, ContentHandler):

def __init__(self):
    """Open the output file and reset all per-document parsing state.

    The output file 'ddt_SAX.html' is created (or truncated) in the current
    working directory; it is closed by endChannel().
    """
    # Initialize the SAX base class before adding our own state.
    ContentHandler.__init__(self)
    self.fout = open('ddt_SAX.html', 'w')
    self.imagein = False     # True while inside an <image> element
    self.desflag = False     # not read by any handler in this listing
    self.item = False        # True while inside an <item> element
    self.title = ''          # most recent channel/item title text
    self.link = ''           # most recent <link> text
    self.guid = ''           # most recent <guid> text
    self.url = ''            # most recent <url> text
    self.pubdate = ''        # most recent <pubDate> text
    self.description = ''    # most recent <description> text
    self.temp = ''           # text accumulated by characters()
    self.prx = ''            # not read by any handler in this listing
def startChannel(self):
    """Handle <channel>: write the opening of the HTML document.

    The <title> element is left open here; endTitle() writes the channel
    title text and the closing tag.
    """
    page_head = '<html>\n<head>\n<title> RSS-'
    self.fout.write(page_head)

def endChannel(self):
    """Handle </channel>: write the page footer and close the output file.

    The footer closes the item table and embeds a script that rewrites the
    content of every <span> (the publication dates written by endItem())
    into a "published at ... (N days/hours/minutes ago)" label.

    NOTE(review): the non-ASCII text inside the JavaScript string literals
    looks like mis-decoded GB2312 (mojibake) in this listing.  It is
    reproduced unchanged; restore it from the original source if available.
    """
    try:
        # The closing </script>, </body> and </html> tags are written here
        # so that the script element opened in this footer is terminated
        # and the document is well-formed.
        self.fout.write('''  
                <tr><td height="20"></td></tr>  
                </table>  
                </center>  
                <script>  
function  GetTimeDiff(str)  
{  
 if(str == '')  
 {  
  return '';  
 }

 var pubDate = new Date(str);  
 var nowDate = new Date();  
 var diffMilSeconds = nowDate.valueOf()-pubDate.valueOf();  
 var days = diffMilSeconds/86400000;  
 days = parseInt(days);

 diffMilSeconds = diffMilSeconds-(days*86400000);  
 var hours = diffMilSeconds/3600000;  
 hours = parseInt(hours);

 diffMilSeconds = diffMilSeconds-(hours*3600000);  
 var minutes = diffMilSeconds/60000;  
 minutes = parseInt(minutes);

 diffMilSeconds = diffMilSeconds-(minutes*60000);  
 var seconds = diffMilSeconds/1000;  
 seconds = parseInt(seconds);  

 var returnStr = "±±¾(C)・¢²¼Ê±¼a£º" + pubDate.toLocaleString();

 if(days > 0)  
 {  
  returnStr = returnStr + " £¨¾aÀeÏÖÔÚ" + days + "Ìi" + hours + "Сʱ" + minutes + "・ÖÖÓ£(C)";  
 }  
 else if (hours > 0)  
 {  
  returnStr = returnStr + " £¨¾aÀeÏÖÔÚ" + hours + "Сʱ" + minutes + "・ÖÖÓ£(C)";  
 }  
 else if (minutes > 0)  
 {  
  returnStr = returnStr + " £¨¾aÀeÏÖÔÚ" + minutes + "・ÖÖÓ£(C)";  
 }

 return returnStr;

}

function GetSpanText()  
{  
 var pubDate;  
 var pubDateArray;  
 var spanArray = document.getElementsByTagName("span");

 for(var i = 0; i < spanArray.length; i++)  
 {  
  pubDate = spanArray[i].innerHTML;  
  document.getElementsByTagName("span")[i].innerHTML = GetTimeDiff(pubDate);     
 }  
}

GetSpanText();  
</script>
</body>
</html>
''')
    finally:
        # Release the file handle even if the final write fails.
        self.fout.close()
def characters(self, chars):
    """SAX text callback: accumulate element text into ``self.temp``.

    Chunks made up solely of whitespace are dropped; any other chunk is
    appended unmodified, so its surrounding whitespace is kept.
    """
    if not chars.strip():
        return
    self.temp = self.temp + chars


def startTitle(self):
    """Handle <title>: inside an item, open the table row for its heading.

    Titles outside an <item> produce no output at this point.
    """
    if not self.item:
        return
    self.fout.write('''  
                    <tr bgcolor="#eeeeee">\n<td style="padding-top:5px;padding-left:5px;" height="30">\n<B>  
                ''')

def endTitle(self):
    """Handle </title> for the channel, an item, or the channel image.

    * Channel title (not inside <image> or <item>): written into the open
      HTML <title>, followed by the page header, helper scripts and the
      start of the header table.
    * Item title: written into the row opened by startTitle(), after which
      the row for the item body is opened.
    * Image title: produces no output and leaves ``self.title`` untouched.

    The buffered text is consumed in every case.  Without that, the image
    title would stay in ``self.temp`` and be prepended to the text of the
    following element.

    NOTE(review): the non-ASCII text in the alert() string looks like
    mis-decoded GB2312 (mojibake) in this listing; it is reproduced
    unchanged.
    """
    text = self.temp
    self.temp = ''

    if not self.imagein and not self.item:
        self.title = text
        self.fout.write(self.title.encode('gb2312'))

        # The JavaScript regex backslashes are doubled so that Python sees
        # a valid escape; the emitted text is still "\/".
        self.fout.write('''  
            </title>\n</head>\n<body>\n<center>\n  
            <script>\n

                    function copyLink()  
                    {  
                            clipboardData.setData("Text",window.location.href);  
                            alert("RSSÁ´½ÓÒѾ­¸´ÖƵ½¼oÌu°a");  
                    }

                    function subscibeLink()  
                    {  
                            var str = window.location.pathname;  
                            while(str.match(/^\\//))  
                            {  
                                    str = str.replace(/^\\//,"");  
                            }  
                            window.open("http://rss.sina.com.cn/my_sina_web_rss_news.html?url=" + str,"_self");

                    }  
                    </script>\n  
            <table width="750" cellpadding="0" cellspacing="0">\n  
            <tr>\n  
            <td align="right" style="padding-right:15px;" valign="bottom">\n  
        ''')

    if self.item:
        self.title = text
        self.fout.write(self.title.encode('gb2312'))
        self.fout.write('''  
                    </B>  
                    </td>  
                    </tr>  
                    <tr bgcolor="#eeeeee">  
                    <td style="padding-left:5px;">  
                    ''')

def startImage(self):
    """Handle <image>: flag that subsequent elements belong to the image."""
    self.imagein = True

def endImage(self):
    """Handle </image>: clear the flag set by startImage()."""
    self.imagein = False

def startLink(self):
    """Handle <link>: inside <image>, open the anchor wrapping the logo.

    Links outside <image> produce no output here; endLink() deals with
    them.
    """
    if not self.imagein:
        return
    self.fout.write('<A href=" ')


def endLink(self):
    """Handle </link>: store the URL and emit it where the page needs it.

    The buffered text becomes ``self.link`` and the buffer is cleared.
    Then, depending on context:

    * inside <image>: the URL completes the anchor opened by startLink();
    * inside <item>: nothing is written (endItem() writes ``self.link``);
    * otherwise (channel link): the URL, the channel title, the stored
      description and the "copy link / subscribe" toolbar are written, and
      the table that will hold the items is opened.

    All text is written GB2312-encoded, the same way the title and
    description are written elsewhere in this class.

    NOTE(review): the non-ASCII anchor texts in the toolbar row look like
    mis-decoded GB2312 (mojibake) in this listing; they are reproduced
    unchanged.
    """
    self.link = self.temp
    self.temp = ''
    if self.imagein:
        self.fout.write(self.link.encode('gb2312'))
        self.fout.write('''" target="_blank">\n ''')
    elif self.item:
        # Item links are only remembered here; endItem() emits them.
        pass
    else:
        self.fout.write(self.link.encode('gb2312'))
        self.fout.write(''' " target="  
  _blank  
 "> ''')
        self.fout.write(self.title.encode('gb2312'))
        self.fout.write(''' </A></B></td>  
                        </tr>  
                        <tr><td colspan="2" align="center">  
                        ''')
        self.fout.write(self.description.encode('gb2312'))
        self.fout.write('''  
                    </td></tr>  
                    <tr style="font-size:12px;" bgcolor="#eeeeff"><td colspan="2" style="font-size:14px;padding-top:5px;padding-bottom:5px;"><b><a href="javascript:copyLink();">¸´ÖÆ´ËÒ³Á´½Ó</a>                <a href="javascript:subscibeLink();">ÎÒҪǶÈe¸ÃÐÂÎÅÁбiµ½ÎÒµÄÒ³Ãae£¨¼oµ¥¡¢¿iËÙ¡¢ÊµÊ±¡¢Ãa・Ñ£(C)</a></b></td></tr>  
                    </table>  
                    <table width="750" cellpadding="0" cellspacing="0">  
                        ''')

def startUrl(self):
    """Handle <url>: inside <image>, open the <IMG> tag for the logo.

    A <url> outside <image> produces no output.
    """
    if not self.imagein:
        return
    self.fout.write('<IMG src=" ')
def endUrl(self):
    """Handle </url>: store the URL and, inside <image>, finish the logo.

    The buffered text becomes ``self.url`` and the buffer is cleared.
    Inside <image> the URL completes the <IMG> tag opened by startUrl(),
    the logo anchor and cell are closed, and a new cell is opened ending
    in an unterminated ``<A href="`` that the channel branch of endLink()
    completes.  Outside <image> nothing is written.
    """
    self.url = self.temp
    self.temp = ''
    if self.imagein:
        self.fout.write(self.url.encode('gb2312'))
        self.fout.write('''" border="0">\n  
                        </A>  
                        </td>  
                        <td align="left" valign="bottom" style="padding-bottom:8px;"><B><A href="  
                        ''')

def defaultStart(self):
    """Fallback start handler for tags without a start<Name> method.

    Intentionally does nothing.
    """
    pass
def defaultEnd(self):
    """Fallback end handler for tags without an end<Name> method.

    Discards the text buffered by characters() so it does not carry over
    into the next element.
    """
    self.temp = ''
def startDescription(self):
    """Handle <description>: nothing to emit until the text is complete.

    Defining this method keeps the tag from falling through to
    defaultStart(), which is likewise a no-op.
    """
    pass
def endDescription(self):
    """Handle </description>: store the text and, inside an item, emit it.

    The buffered text becomes ``self.description`` and the buffer is
    cleared.  Only item descriptions are written here (GB2312-encoded);
    outside an item the text is just stored.
    """
    text = self.temp
    self.temp = ''
    self.description = text
    if not self.item:
        return
    self.fout.write(text.encode('gb2312'))

def endGuid(self):
    """Handle </guid>: store the text for endItem() to write.

    The buffer is cleared afterwards, like every other end handler does,
    so the guid text is not prepended to the next element's text.
    """
    self.guid = self.temp
    self.temp = ''
def endPubdate(self):
    """Handle </pubDate>: store the date text for endItem() to write.

    Text starting with 'http' is rejected and the stored date is set to
    the empty string instead.  NOTE(review): this presumably guards
    against URL text left over from a preceding element; confirm.

    The buffer is cleared in both cases so rejected text cannot carry
    over into the next element.
    """
    text = self.temp
    self.temp = ''
    self.pubdate = '' if text.startswith('http') else text
def startItem(self):
    """Handle <item>: flag that subsequent elements belong to an item."""
    self.item = True
def endItem(self):
    """Handle </item>: close the item body and write its link and date rows.

    Clears the item flag, closes the description cell, then writes a row
    with an anchor (href ``self.link``, text ``self.guid``), a row with
    ``self.pubdate`` wrapped in a <span>, and a spacer row.  The footer
    script written by endChannel() rewrites every <span> on the page.

    The three values are written GB2312-encoded, the same way the title
    and description are written elsewhere in this class.
    """
    self.item = False
    self.fout.write('''  
                        </td>  
                        </tr>  
                        <tr bgcolor="#eeeeee">  
                        <td style="padding-top:5px;padding-left:5px;">  
                        <A href="''')
    self.fout.write(self.link.encode('gb2312'))
    self.fout.write(''' " target="_blank"> ''')
    self.fout.write(self.guid.encode('gb2312'))
    self.fout.write('''  
                    </A>  
                    </td>  
                    </tr>  
                    <tr bgcolor="#eeeeee">  
                    <td style="padding-top:5px;padding-left:5px;padding-bottom:5px;"><span>''')
    self.fout.write(self.pubdate.encode('gb2312'))
    self.fout.write('''</span></td>  
                    </tr>  
                    <tr height="10"><td></td></tr>''')

# 程序入口

# Script entry point: parse ddt.xml with the Website handler, which writes
# the generated page to ddt_SAX.html.
if __name__ == '__main__':
    parse('ddt.xml', Website())

Copyright© 2013-2020

All Rights Reserved 京ICP备2023019179号-8