Python实现批量将word转html并将html内容发布至网站的方法

473次阅读  |  发布于5年以前

本文实例讲述了Python实现批量将word转html并将html内容发布至网站的方法。分享给大家供大家参考。具体实现方法如下:


    #coding=utf-8
    __author__ = 'zhm'
    from win32com import client as wc
    import os
    import time
    import random
    import MySQLdb
    import re
    def wordsToHtml(dir):
        """Convert every Word document found under *dir* to an HTML file.

        The directory tree is walked recursively. Each ``.doc`` / ``.docx``
        file (suffix compared case-insensitively) is opened through the WPS
        COM automation object and saved next to the original as
        ``<name>.html``.

        dir -- root directory to scan; a byte string whose file names are
               GBK-encoded (they are decoded with ``unicode(..., "gbk")``).

        Files without a suffix are reported and skipped; files with any other
        suffix are skipped silently and are never opened in the application.
        """
        # Kingsoft WPS automation: preview builds register the "KWPS" prog-id,
        # release builds use "WPS" instead.
        word = wc.Dispatch('KWPS.Application')
        try:
            for path, subdirs, files in os.walk(dir):
                for wordFile in files:
                    wordFile2 = unicode(wordFile, "gbk")
                    dotIndex = wordFile2.rfind(".")
                    if dotIndex == -1:
                        print('********************ERROR: 未取得后缀名!')
                        continue
                    # Check the suffix BEFORE opening, so unrelated files are
                    # never loaded into (and left open in) the application.
                    fileSuffix = wordFile2[dotIndex + 1:].lower()
                    if fileSuffix not in ("doc", "docx"):
                        continue
                    htmlName = wordFile2[:dotIndex] + ".html"
                    htmlFullName = os.path.join(unicode(path, "gbk"), htmlName)
                    doc = word.Documents.Open(os.path.join(path, wordFile))
                    try:
                        # 8 is the "filtered/plain HTML" save format constant
                        # (wdFormatHTML) of the Word-compatible object model.
                        doc.SaveAs(htmlFullName, 8)
                    finally:
                        # Always release the document, even if saving failed.
                        doc.Close()
                    print(u'生成了html文件:' + htmlFullName)
        finally:
            # Shut the automation server down even when a conversion raised,
            # otherwise a hidden application process is left running.
            word.Quit()
        print("")
        print("Finished!")
    def _timestamped_name(suffix):
        # Build a practically unique file name: current time in milliseconds,
        # followed by a random number, followed by *suffix*.
        return str(int(time.time() * 1000)) + str(random.randint(100, 10000)) + suffix


    def _copy_file(src, dst):
        # Copy *src* to *dst* byte for byte; both handles are closed even on error.
        with open(src, 'rb') as reader:
            data = reader.read()
        with open(dst, 'wb') as writer:
            writer.write(data)


    def _publish_images(html, srcDir, imgTarget, imgPattern):
        # Copy each locally available image referenced by *html* into
        # *imgTarget* and return the html with those references rewritten to
        # the drive-less path of the copy.
        seen = set()
        for tagContent in imgPattern.findall(html):
            if tagContent in seen:
                # Already copied and replaced everywhere by an earlier pass.
                continue
            seen.add(tagContent)
            imgSrcFullName = os.path.join(srcDir, unicode(tagContent, "gbk"))
            if not os.path.isfile(imgSrcFullName):
                # Remote URL or missing file: leave the reference untouched
                # instead of aborting the whole import.
                print("Warning: image not found, reference left unchanged")
                continue
            if not os.path.exists(imgTarget):
                os.makedirs(imgTarget)
            targetImgFile = os.path.join(imgTarget, _timestamped_name('.png'))
            if (os.path.exists(targetImgFile) and
                    os.path.getsize(targetImgFile) == os.path.getsize(imgSrcFullName)):
                continue
            _copy_file(imgSrcFullName, targetImgFile)
            # "D:/x/y.png".split(":")[1] -> "/x/y.png", the path as served
            # by the web server.
            html = html.replace(tagContent, targetImgFile.split(":")[1])
        return html


    def _iframe_wrapper(src):
        # Return the markup stored as the article content: a script plus an
        # iframe pointing at *src*. The literal is kept verbatim (including
        # its leading whitespace) so the stored text does not change.
        return '''
         <script type="text/javascript" language="javascript">
          function iFrameHeight() {
           var ifm= document.getElementById("iframepage");
           var subWeb = document.frames ? document.frames["iframepage"].document:ifm.contentDocument;
           if(ifm != null && subWeb != null) {
            ifm.height = subWeb.body.scrollHeight;
           }
          }
         </script>
         <iframe src='''+src+'''
          marginheight="0" marginwidth="0" frameborder="0" scrolling="no" width="765" height=100% id="iframepage" name="iframepage" onLoad="iFrameHeight()" ></iframe>
         '''


    def html_add_to_db(dir):
        """Publish every HTML file under *dir* and register it in the database.

        For each ``.htm`` / ``.html`` file found while walking *dir*:

        1. its ``<img>`` sources are copied to ``D:/files/images/whzx/`` and
           the references in the markup are rewritten to the copies;
        2. the rewritten markup is written to a new file in
           ``D:/files/htmls/``;
        3. a row ``(title, content)`` is inserted into ``common_article``,
           where *title* is the file name without suffix and *content* is an
           iframe wrapper pointing at the published file.

        ``D:/files`` is the static directory configured on the web server,
        which is why stored paths have the drive prefix stripped.

        dir -- root directory to scan; a byte string whose file names are
               GBK-encoded.

        A failed insert is reported and the run continues. All inserts are
        committed once at the end; the cursor and connection are always
        closed.
        """
        targetDir = 'D:/files/htmls/'
        imgTarget = 'D:/files/images/whzx/'
        sql = "insert into common_article(title,content) values(%s,%s)"
        # Compiled once instead of once per file.
        img = re.compile(r"""<img\s.*?\s?src\s*=\s*['|"]?([^\s'"]+).*?>""", re.I)

        conn = MySQLdb.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='root',
            db='test',
            charset='utf8'
        )
        try:
            cur = conn.cursor()
            try:
                for path, subdirs, files in os.walk(dir):
                    # Decode the directory once so every join below is
                    # unicode + unicode (no implicit ASCII decoding).
                    srcDir = unicode(path, "gbk")
                    for htmlFile in files:
                        htmlFile2 = unicode(htmlFile, "gbk")
                        dotIndex = htmlFile2.rfind(".")
                        if dotIndex == -1:
                            print('********************ERROR: 未取得后缀名!')
                            continue
                        fileSuffix = htmlFile2[dotIndex + 1:].lower()
                        if fileSuffix not in ("htm", "html"):
                            continue

                        title = os.path.splitext(htmlFile2)[0]
                        htmlFullName = os.path.join(srcDir, htmlFile2)
                        with open(htmlFullName, 'rb') as htFile:
                            htmStrCotent = htFile.read()
                        htmStrCotent = _publish_images(htmStrCotent, srcDir, imgTarget, img)

                        if not os.path.exists(targetDir):
                            os.makedirs(targetDir)
                        targetFile = os.path.join(targetDir, _timestamped_name('.html'))
                        if (os.path.exists(targetFile) and
                                os.path.getsize(targetFile) == os.path.getsize(htmlFullName)):
                            # Name collision with a same-sized file: treated
                            # as already published.
                            continue
                        with open(targetFile, "wb") as tmpTargetFile:
                            tmpTargetFile.write(htmStrCotent)

                        iframeHtml = _iframe_wrapper(targetFile.split(':')[1])
                        try:
                            cur.execute(sql, (title, iframeHtml))
                        except MySQLdb.Error as e:
                            # Only database errors are tolerated here; anything
                            # else (including Ctrl-C) propagates.
                            print("Error: unable to insert data")
                            print(repr(e))
                conn.commit()
            finally:
                cur.close()
        finally:
            # Close the database connection.
            conn.close()
    if __name__ == '__main__':
        # Script entry point: first convert the Word documents, then publish
        # the generated HTML files, both over the same directory.
        for step in (wordsToHtml, html_add_to_db):
            step('d:/word')

希望本文所述对大家的Python程序设计有所帮助。

Copyright© 2013-2020

All Rights Reserved 京ICP备2023019179号-8