# Code listing (copied from the original web page) follows:
'''
Created on 2014-4-24
@author: Leon Wong
'''
import urllib2
import urllib
import re
import time
import os
import uuid
def findUrl2(html):
    """Extract the detail-page URLs from a tuchong.com listing page.

    Two URL shapes are recognised:

    * ``http://tuchong.com/<digits>/<digits>/``
    * ``http://<subdomain>.tuchong.com/<digits>/`` where the subdomain does
      not end in ``photos``.

    :param html: text of the listing page.
    :return: list of matched URLs with duplicates removed, in order of
        first appearance.
    """
    # Dots in the host names are escaped so they match only a literal "."
    # (an unescaped "." would also accept e.g. "tuchongXcom").
    pattern = (r'http://tuchong\.com/\d+/\d+/'
               r'|http://\w+(?<!photos)\.tuchong\.com/\d+/')
    seen = set()
    unique_urls = []
    # Single pass, order-preserving de-duplication.
    for url in re.findall(pattern, html):
        if url not in seen:
            seen.add(url)
            unique_urls.append(url)
    return unique_urls
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    :param url: address of the page to fetch.
    :return: the page content as a unicode string.
    :raises UnicodeDecodeError: if the body is not valid UTF-8.

    Errors raised by ``urllib2.urlopen`` are not caught here.
    """
    response = urllib2.urlopen(url)
    try:
        # Decode as UTF-8.
        return response.read().decode('utf-8')
    finally:
        # Always release the connection, even if read/decode fails.
        response.close()
def download(html_page, pageNo, base_dir='D:\\TuChong'):
    """Download every photo referenced in *html_page*.

    Images are saved as ``<base_dir>/<year>-<month>-<day>/<pageNo>/<uuid>.jpg``
    using today's local date (month and day are not zero-padded).

    :param html_page: text of a detail page to scan for photo URLs.
    :param pageNo: listing page number; used as the innermost folder name.
    :param base_dir: root folder for the downloads.
    :return: the result of the last ``urllib.urlretrieve`` call, or ``None``
        when the page contains no matching photo URL.
    """
    today = time.localtime(time.time())
    foldername = "%d-%d-%d" % (today.tm_year, today.tm_mon, today.tm_mday)

    # Host dots and the ".jpg" dot are escaped so they match literally; the
    # path parts are non-greedy and stop at quotes, whitespace and angle
    # brackets so that two URLs on the same line are not fused into one match.
    re2 = r"http://photos\.tuchong\.com/[^\s\"'<>]+?/f/[^\s\"'<>]+?\.jpg"
    imglist = re.findall(re2, html_page)
    print(imglist)

    # The destination folder is the same for every image, so build it once;
    # it is only created when there is something to download.
    picpath = os.path.join(base_dir, foldername, str(pageNo))
    if imglist and not os.path.isdir(picpath):
        os.makedirs(picpath)

    download_img = None
    for imgurl in imglist:
        # A UUID file name avoids collisions between images.
        target = os.path.join(picpath, "%s.jpg" % uuid.uuid1())
        print("The photos location is:" + target)
        # Download the image to the target path.
        download_img = urllib.urlretrieve(imgurl, target)
        # Pause between requests.
        time.sleep(1)
        print(imgurl)
    return download_img
def quitit():
    """Print a farewell message and terminate the program with status 0.

    :raises SystemExit: always, with exit code 0.
    """
    print("Bye!")
    # The ``exit`` helper is injected by the ``site`` module for interactive
    # sessions and may be missing (e.g. ``python -S``); raising SystemExit
    # directly does not depend on it.
    raise SystemExit(0)
if __name__ == '__main__':
    # Interactive entry point: ask for a listing page number, then download
    # the photos of every detail page linked from that listing page.
    print('''
Welcome to Spider for TUCHONG
Created on 2014-4-24
@author: Leon Wong
''')
    pageNo = raw_input("Input the page number you want to scratch (1-100),please input 'quit' if you want to quit>")
    # Keep asking until the input is a whole number within 1-100.
    while not (pageNo.isdigit() and 1 <= int(pageNo) <= 100):
        if pageNo == 'quit':
            quitit()
        print("Param is invalid , please try again.")
        pageNo = raw_input("Input the page number you want to scratch >")
    # Crawl the Tuchong "portrait" tag listing (the URL-encoded tag is 人像).
    html = getHtml("http://tuchong.com/tags/%E4%BA%BA%E5%83%8F/?page=" + str(pageNo))
    detllst = findUrl2(html)
    for detail in detllst:
        html2 = getHtml(detail)
        download(html2, pageNo)
    print("Finished.")
# Copyright © 2013-2020
# All Rights Reserved 京ICP备2023019179号-8