Python读取HTML中指定元素生成Excel文件示例

868次阅读  |  发布于5年以前

使用Python 2.7编写的脚本:读取HTML中的指定元素,并生成Excel文件

复制代码 代码如下:

# coding=gbk

import string
import codecs
import os,time
import xlwt
import xlrd
from bs4 import BeautifulSoup
from xlrd import open_workbook

class LogMsg:
    """Small file-logging helper built on the root ``logging`` logger.

    Each instance attaches one ``FileHandler`` to the root logger and sets
    the root logger's level.  ``output`` then emits every message at that
    same level, so nothing passed to ``output`` is filtered out by the
    logger's own level.

    Any failure inside a method prints a short diagnostic and terminates
    the process with exit status 1.
    """

    def __init__(self, logfile, Level=0):
        """Attach a file handler writing to *logfile* and set the level.

        Args:
            logfile: Path of the log file handed to ``logging.FileHandler``.
            Level: Numeric level code.  10/20/30/40/50 select
                DEBUG/INFO/WARNING/ERROR/CRITICAL; any other value selects
                ``logging.NOTSET``.

        Raises:
            SystemExit: With status 1 if the logger cannot be set up.
        """
        try:
            import logging

            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter("[%(asctime)s]: %(message)s", "%Y%m%d %H:%M:%S")
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)

            # The accepted codes are exactly the values of the logging
            # constants, so a membership test replaces the if/elif ladder.
            known_levels = (
                logging.DEBUG,
                logging.INFO,
                logging.WARNING,
                logging.ERROR,
                logging.CRITICAL,
            )
            if Level in known_levels:
                self.logger.setLevel(Level)
            else:
                self.logger.setLevel(logging.NOTSET)
        except Exception:
            # ``except Exception`` (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate untouched.
            print("log init error!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Write *logInfo* to the log at the logger's effective level.

        Args:
            logInfo: The message object passed to the logger.

        Raises:
            SystemExit: With status 1 if emitting the record fails.
        """
        Level = self.logger.getEffectiveLevel()
        try:
            if Level in (10, 20, 30, 40, 50):
                # Logger.log(level, msg) is what debug()/info()/... call
                # with their own fixed level.
                self.logger.log(Level, logInfo)
            else:
                self.logger.info(logInfo)
        except Exception:
            print("log output error!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the logger and close its file.

        Raises:
            SystemExit: With status 1 if detaching or closing fails.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # removeHandler() only detaches; close() releases the open file.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            raise SystemExit(1)

# Take a single clock snapshot so the log-file date and the workbook
# timestamp are derived from the same instant (two separate localtime()
# calls could straddle a second or midnight boundary).
_started_at = time.localtime()
Logtime = time.strftime("%Y%m%d%H%M%S", _started_at)   # used in the .xls file name
logFileTime = time.strftime("%Y%m%d", _started_at)     # used in the log file name
Logfile = '/data/pyExample/logs/htmlparser_%s.log' % logFileTime
# Second argument is the numeric level code passed to LogMsg.
log = LogMsg(Logfile,20)

# Directory scanned for .html input; the generated workbook is saved here too.
DATAPATH = '/data/pyExample/'
XLSname = 'dangjian_'+Logtime+'.xls'

# Column titles written to row 0 of the sheet, in column order.
_HEADER = (
    '内容类型 ',
    '栏目名称',
    '栏目编号',
    '内容名称',
    '时长',
    '关键字',
    '看点',
    '作者',
    '来源',
    '子内容1',
    '子内容2',
)

# The row builder reads cells up to index 36, so at least 37 are required.
_REQUIRED_CELLS = 37


def _extract_td_texts(path):
    """Return the GBK-encoded text of every <td> in the file at *path*.

    The file is decoded as GBK and parsed one line at a time, so cells are
    returned in file order.  Blank lines are logged and skipped.
    """
    cells = []
    # ``with`` closes the file even if decoding raises part-way through.
    with codecs.open(path, 'r', 'gbk') as htmlFile:
        lines = htmlFile.readlines()
    if not lines:
        log.output ('not line')
    for line in lines:
        # strip() also removes the newline, so comparing the stripped line
        # with '\n' could never match; test for emptiness instead.
        if not line.strip():
            log.output('该处是空行')
            continue
        # NOTE(review): this removes every space, including those inside
        # tags (e.g. between a tag name and its attributes) -- possibly the
        # original literal was an HTML entity lost in publishing; confirm.
        line = line.replace(' ','')
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(line, 'html.parser')
        for tdd in soup.findAll('td'):
            cells.append(tdd.text.encode("gbk"))
    return cells


if __name__ == '__main__':
    # Build one worksheet with a header row, then add one data row for each
    # .html file found in DATAPATH, and save the workbook into DATAPATH.
    # Files that lack the expected cells are logged and skipped.
    # print(...) with a single argument behaves the same as the Python 2
    # print statement this script was written with.
    wbk = xlwt.Workbook(encoding = 'gbk')
    sheet = wbk.add_sheet('基本内容导入模板')
    for col, title in enumerate(_HEADER):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != '.html':
            continue
        log.output('当前文件:'+f)
        content = _extract_td_texts(os.path.join(DATAPATH, f))

        # enumerate() gives each cell its real position; list.index() would
        # repeat the first match's index for duplicate cell texts.
        for idx, item in enumerate(content):
            print('%d , %s' % (idx, item))
            log.output(item)
            log.output(idx)
        print('----------------------------------------')

        if len(content) < _REQUIRED_CELLS:
            log.output('skip %s: expected at least %d td cells, found %d'
                       % (f, _REQUIRED_CELLS, len(content)))
            continue

        folderName = content[6]
        contentName = content[4]
        # Keep only the digits of the duration cell, then multiply by 60.
        digits = ''.join(ch for ch in content[16] if ch.isdigit())
        if not digits:
            log.output('skip %s: no digits in duration cell' % f)
            continue
        str_duration = "%i" % (int(digits) * 60)
        keyWord = content[6]
        desciption = content[36]
        videoName_1 = content[10]

        print(folderName)
        print(contentName)
        print(str_duration)
        print(keyWord)
        print(desciption)
        print(videoName_1)
        log.output('输出xls数据:'+','+folderName+',,'+contentName+','+str_duration+','+keyWord+','+desciption+',管理员,华数编辑,'+videoName_1+',,')
        print(k)

        # Values in the same column order as _HEADER.
        row = (
            '',
            folderName,
            '',
            contentName,
            str_duration,
            keyWord,
            desciption,
            '管理员',
            '华数编辑',
            videoName_1,
            '',
        )
        for col, value in enumerate(row):
            sheet.write(k+1, col, value)
        k += 1

    wbk.save(os.path.join(DATAPATH, XLSname))

    print('=========================================')

Copyright© 2013-2020

All Rights Reserved 京ICP备2023019179号-8