使用Python3编写抓取网页和只抓网页图片的脚本 - 哈喽比特

933次阅读 | 发布于6年以前

最基本的抓取网页内容的代码实现（下载网页并打印其中第一个和最后一个非空行）：


    #!/usr/bin/env python3
    """Download a web page and print its first and last non-blank lines."""

    from urllib.request import urlcleanup, urlretrieve


    def firstNonBlank(lines):
        """Return the first entry of *lines* that contains non-whitespace text.

        Returns None when *lines* is empty or every entry is blank.
        """
        for eachLine in lines:
            if eachLine.strip():
                return eachLine
        return None


    def firstLast(webpage):
        """Print the first and the last non-blank line of the file *webpage*.

        *webpage* is the path of a local file. Lines are printed verbatim:
        each one still carries its own line ending, so print() adds none.
        Nothing is printed when the file has no non-blank line.
        """
        # Page encodings vary; decode leniently so an unexpected byte
        # sequence cannot abort the run. The with-block closes the file even
        # if reading fails.
        with open(webpage, encoding='utf-8', errors='replace') as f:
            lines = f.readlines()
        first = firstNonBlank(lines)
        if first is None:
            return
        print(first, end='')
        print(firstNonBlank(reversed(lines)), end='')


    def download(url='http://www', process=firstLast):
        """Retrieve *url* into a local file and pass that file's path to *process*.

        A failed retrieval (OSError, which covers urllib's URLError and
        HTTPError) is ignored and *process* is not called.
        """
        try:
            retval = urlretrieve(url)[0]
        except OSError:
            return
        try:
            process(retval)
        finally:
            # Remove the temporary file urlretrieve() may have created.
            urlcleanup()


    if __name__ == '__main__':
        download()

利用urllib模块，实现针对网页中图片的抓取功能：


    import urllib.request 
    import socket 
    import re 
    import sys 
    import os 
    targetDir = r"C:\Users\elqstux\Desktop\pic"


    def destFile(path, dest_dir=None):
        """Return the local file path under which the image at *path* is saved.

        path: image URL; the text after its last '/' becomes the file name
            (a path without any '/' is used whole).
        dest_dir: directory to save into; defaults to the module-level
            targetDir. It is created, including missing parents, if needed.
        """
        if dest_dir is None:
            dest_dir = targetDir
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(dest_dir, exist_ok=True)
        filename = path.rsplit('/', 1)[-1]
        return os.path.join(dest_dir, filename)

    if __name__ == "__main__":
        # Fetch the page, find every http: link ending in jpg/png/gif and
        # download each distinct one via destFile().
        hostname = "http://www.douban.com"
        req = urllib.request.Request(hostname)
        # The with-block closes the HTTP response once the body is read.
        with urllib.request.urlopen(req) as webpage:
            contentBytes = webpage.read()
            charset = webpage.headers.get_content_charset() or "utf-8"
        # Decode the body rather than calling str() on the bytes: str(bytes)
        # yields the "b'...'" repr, in which newlines and non-ASCII bytes
        # appear as backslash escapes and corrupt the matched links.
        html = contentBytes.decode(charset, errors="replace")
        # findall() returns (full_url, extension) tuples; set() drops repeats.
        for link, _ext in set(re.findall(r'(http:[^\s]*?(jpg|png|gif))', html)):
            print(link)
            try:
                urllib.request.urlretrieve(link, destFile(link))
            except OSError as err:
                # One unreachable image must not stop the remaining downloads.
                print("failed to download %s: %s" % (link, err), file=sys.stderr)


    import urllib.request 
    import socket 
    import re 
    import sys 
    import os 
    targetDir = r"H:\pic"


    def destFile(path, dest_dir=None):
        """Return the local file path under which the image at *path* is saved.

        path: image URL; the text after its last '/' becomes the file name
            (a path without any '/' is used whole).
        dest_dir: directory to save into; defaults to the module-level
            targetDir. It is created, including missing parents, if needed.
        """
        if dest_dir is None:
            dest_dir = targetDir
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(dest_dir, exist_ok=True)
        # Split on the last '/' to isolate the file name.
        filename = path.rsplit('/', 1)[-1]
        return os.path.join(dest_dir, filename)

    if __name__ == "__main__":
        # Fetch the page and print every matched image URL with its extension.
        hostname = "http://www.douban.com/"
        req = urllib.request.Request(hostname)
        # The with-block closes the HTTP response once the body is read.
        with urllib.request.urlopen(req) as webpage:
            contentBytes = webpage.read()
            charset = webpage.headers.get_content_charset() or "utf-8"
        # Decode the body rather than calling str() on the bytes: str(bytes)
        # yields the "b'...'" repr, in which newlines and non-ASCII bytes
        # appear as backslash escapes and corrupt the matched links.
        html = contentBytes.decode(charset, errors="replace")
        # The pattern contains two nested groups, so findall() returns a list
        # of (full_url, extension) tuples; only text captured by a group
        # appears in the result.
        match = re.findall(r'(http:[^\s]*?(jpg|png|gif))', html)
        for picname, picType in match:
            print(picname)
            print(picType)


    '''
    输出： 
    http://img3.douban.com/pics/blank.gif 
    gif 
    http://img3.douban.com/icon/g111328-1.jpg 
    jpg 
    http://img3.douban.com/pics/blank.gif 
    gif 
    http://img3.douban.com/icon/g197523-19.jpg 
    jpg 
    http://img3.douban.com/pics/blank.gif 
    gif 
    ... 
    '''