复制代码 代码如下:
import urllib.request
import xml.dom.minidom
import sqlite3
import threading
import time
class logger(object):
    """Minimal console logger used by the rest of the script."""

    def log(self, *msg):
        """Print each positional argument on its own line.

        Args:
            *msg: Any number of objects; each one is passed to ``print``.
        """
        for item in msg:
            print(item)
# Module-wide logger instance; the classes below report progress through it.
Log = logger()
# Prints a test message as soon as the module is loaded.
Log.log('测试下')
class downloader(object):
    """Fetch the raw body of a single URL."""

    def __init__(self, url):
        """Remember the URL to fetch.

        Args:
            url: Address handed to ``urllib.request.urlopen``.
        """
        self.url = url

    def download(self):
        """Download ``self.url`` and return the response body.

        Returns:
            The body as ``bytes``, or ``None`` when the request fails.
        """
        Log.log('开始下载', self.url)
        try:
            # The context manager closes the HTTP response even if read() fails.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception:
            # Best-effort fetch: a failure is reported to the caller as None.
            # Unlike a bare ``except``, KeyboardInterrupt/SystemExit propagate.
            Log.log('下载出错')
            return None
        Log.log('下载完毕')
        return content
class parser(object):
    """Extract the article text and linked article ids from one page."""

    def __init__(self, content):
        """Parse the page markup into a DOM tree.

        Args:
            content: Page markup (``str`` or ``bytes``). It must be
                well-formed XML because ``xml.dom.minidom`` does the parsing.

        Raises:
            xml.parsers.expat.ExpatError: If ``content`` is not well-formed.
        """
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Collect the page content and the linked article ids.

        Returns:
            A dict ``{'content': str, 'url': list}``. ``content`` is the
            data of the first child of the last ``<div class="content">``
            ('' when there is none); ``url`` holds one id per ``<span>``
            whose parent is an ``<a>`` element.
        """
        Log.log('开始提取数据')
        contents = {'content': '', 'url': []}

        for div in self.html.getElementsByTagName('div'):
            # getAttribute returns '' for a missing attribute, so this also
            # covers divs without any class.
            if div.getAttribute('class') != 'content':
                continue
            # An empty div has no first child, and a nested element has no
            # ``data``; skip those instead of raising.
            text = getattr(div.firstChild, 'data', None)
            if text is not None:
                contents['content'] = text

        for span in self.html.getElementsByTagName('span'):
            anchor = span.parentNode
            # The parent can be the Document node, which has no tagName.
            if getattr(anchor, 'tagName', None) != 'a':
                continue
            href = anchor.getAttribute('href')
            # Drop the first 10 and the last 4 characters of the href;
            # presumably it looks like '/articles/<id>.htm' - TODO confirm.
            contents['url'].append(href[10:-4])

        Log.log('提取数据完毕')
        return contents
def downloadPage(qid, db):
    """Download one article page and record what it contains in ``db``.

    Args:
        qid: Article id; it is interpolated into the article URL.
        db: Object providing ``updateContent``, ``addQID`` and
            ``updateStatus`` (see ``dbConnect``).
    """
    page_url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
    raw = downloader(page_url).download()
    if not raw:
        # Download failed or returned an empty body: leave the row untouched.
        return

    extracted = parser(raw).parse()
    text = extracted['content']
    linked_ids = extracted['url']

    if text:
        db.updateContent(qid, text)
    for linked_id in linked_ids:
        db.addQID(linked_id)
    # The article is marked finished (status 2) only when exactly two linked
    # ids were found on the page.
    if len(linked_ids) == 2:
        db.updateStatus(qid, 2)
class downloaderPool(object):
    """Run ``downloadPage`` for queued article ids on a bounded set of threads."""

    def __init__(self, maxLength=15):
        """Create an idle pool.

        Args:
            maxLength: Number of worker slots, i.e. the maximum number of
                download threads started per ``daemon`` pass.
        """
        self.downloaders = [None] * maxLength
        self.downloadList = []
        self.db = None

    def setDownloadList(self, downloadList):
        """Merge ``downloadList`` into the pending ids, dropping duplicates.

        Args:
            downloadList: List of article ids to queue.
        """
        self.downloadList = list(set(self.downloadList + downloadList))

    def setdb(self, db):
        """Set the database object handed to every ``downloadPage`` call."""
        self.db = db

    def daemon(self):
        """Run one scheduling pass: free finished slots, fill empty ones.

        At most one thread per slot is started. The method returns once all
        threads started in this pass have finished, followed by a one-second
        pause.
        """
        Log.log('设置守护进程')
        # Release slots whose thread has already finished.
        for index, worker in enumerate(self.downloaders):
            if worker is not None and not worker.is_alive():
                Log.log('将下载器置空', index)
                self.downloaders[index] = None

        started = []
        for index, worker in enumerate(self.downloaders):
            if worker is not None:
                continue
            qid = self.getQID()
            if not qid:
                continue
            thread = threading.Thread(target=downloadPage,
                                      args=(qid, self.db))
            self.downloaders[index] = thread
            thread.start()
            started.append(thread)
            Log.log('设置下载器', index)

        # Join only after every free slot has been filled. Joining right
        # after start() would run the downloads one at a time.
        for thread in started:
            thread.join()
        time.sleep(1)

    def getQID(self):
        """Remove and return the next pending id, or ``None`` if none is left."""
        if not self.downloadList:
            return None
        return self.downloadList.pop(0)

    def beginDownload(self):
        """Run one ``daemon`` pass on a daemon thread and wait for it."""
        runner = threading.Thread(target=self.daemon, daemon=True)
        runner.start()
        runner.join()

    def getDownloader(self):
        """Return the index of the first free slot, or ``None`` if all are used."""
        for index, worker in enumerate(self.downloaders):
            if not worker:
                return index
        return None
# Parameterised SQL statements for the ``qiushibaike`` table.
# Insert a new row; parameters: (id, success).
ADD_Q_ID = 'insert into qiushibaike(id,success) values(?,?)'
# Store the content of a row; parameters: (content, id).
UPDATE_Q_CONTENT = 'update qiushibaike set content=? where id=?'
# Change the status flag of a row; parameters: (success, id).
UPDATE_Q_STATUS = 'update qiushibaike set success=? where id=?'
# List the ids that carry a given status flag; parameters: (success,).
Q_LIST = 'select id from qiushibaike where success=?'
# Count the rows with a given id; parameters: (id,).
Q_LIST_BY_ID = 'select count(*) from qiushibaike where id=?'
class dbConnect(object):
    """Small SQLite access layer for the ``qiushibaike`` table.

    Table layout the statements rely on (this class does not create it):

        create table qiushibaike(
            id      Integer,
            content Varchar,
            success Integer
        )

    ``success`` is 1 while an article is unfinished and 2 once it is done.
    Every method opens its own connection and closes it before returning.
    """

    def __init__(self, dbpath='db.sqlite'):
        """Remember the database file path.

        Args:
            dbpath: Path passed to ``sqlite3.connect`` on every call.
        """
        self.dbpath = dbpath

    def _run(self, sql, params, fetch=False):
        """Execute one statement on a fresh connection and always close it.

        Returns the fetched rows when ``fetch`` is true; otherwise commits
        and returns ``None``.
        """
        cn = sqlite3.connect(self.dbpath)
        try:
            c = cn.cursor()
            try:
                c.execute(sql, params)
                if fetch:
                    return c.fetchall()
                cn.commit()
                return None
            finally:
                c.close()
        finally:
            cn.close()

    def addQID(self, qid):
        """Insert ``qid`` with status 1; a failed insert is logged, not raised.

        Args:
            qid: Article id to insert.
        """
        Log.log('插入糗事百科', qid)
        cn = sqlite3.connect(self.dbpath)
        try:
            c = cn.cursor()
            try:
                c.execute(ADD_Q_ID, (qid, 1))
                cn.commit()
            except sqlite3.Error:
                Log.log('添加ID出错', qid)
            else:
                # Report success only when the insert really went through.
                Log.log('插入成功')
            finally:
                c.close()
        finally:
            cn.close()

    def updateContent(self, qid, content):
        """Store ``content`` for the row whose id is ``qid``.

        Raises:
            sqlite3.Error: If the statement fails.
        """
        Log.log('更新糗事百科', qid, content)
        self._run(UPDATE_Q_CONTENT, (content, qid))
        Log.log('更新成功')

    def updateStatus(self, qid, flag):
        """Set the ``success`` flag of row ``qid`` to ``flag``.

        Raises:
            sqlite3.Error: If the statement fails.
        """
        Log.log('更新状态', qid, flag)
        self._run(UPDATE_Q_STATUS, (flag, qid))
        Log.log('更新状态成功')

    def getList(self, unDonloaded=1):
        """Return the ids whose ``success`` flag equals ``unDonloaded``.

        Args:
            unDonloaded: Status flag to select; defaults to 1 (unfinished).

        Returns:
            A list of ids.
        """
        Log.log('获得列表')
        rows = self._run(Q_LIST, (unDonloaded,), fetch=True)
        ids = [row[0] for row in rows]
        Log.log('获得列表成功')
        return ids
class singleDownloader(object):
    """Download queued article ids one after another on the calling thread."""

    def __init__(self):
        """Create a downloader with an empty queue and no database set."""
        self.downloadList = []
        # Defined up front so beginDownload never hits a missing attribute.
        self.db = None

    def setdb(self, db):
        """Set the database object handed to every ``downloadPage`` call."""
        self.db = db

    def setDownloadList(self, downloadList):
        """Merge ``downloadList`` into the pending ids, dropping duplicates.

        Args:
            downloadList: List of article ids to queue.
        """
        self.downloadList = list(set(self.downloadList + downloadList))

    def beginDownload(self):
        """Download every pending id and leave the queue empty.

        The queue is drained first. Otherwise each later call would download
        all ids from earlier rounds again, because setDownloadList merges
        into the existing list.
        """
        pending, self.downloadList = self.downloadList, []
        for qid in pending:
            downloadPage(qid, self.db)
def main():
    """Download unfinished articles until the database lists none.

    Each round reads the unfinished ids from 'db.sqlite', downloads them
    sequentially, pauses one second and reads the list again.
    """
    db = dbConnect('db.sqlite')
    dp = singleDownloader()
    dp.setdb(db)

    pending = db.getList()
    while pending:
        dp.setDownloadList(pending)
        dp.beginDownload()
        time.sleep(1)
        pending = db.getList()
# Start the crawler only when the file is executed as a script; importing the
# module must not trigger a crawl.
if __name__ == '__main__':
    main()
代码是没问题的,可以正常运行,但是希望做到以下2方面:
1、多线程下载
2、代码分离度更高,更面向对象
Copyright© 2013-2020
All Rights Reserved 京ICP备2023019179号-8