Python爬取国外天气预报网站的方法 - 哈喽比特

463次阅读 | 发布于6年以前

本文通过实例讲述用 Python 爬取国外天气预报网站（AccuWeather）的方法，分享给大家参考。示例代码基于 Python 2（使用了 urllib2、Queue 模块和 print 语句），具体如下：

crawl_weather.py（多线程递归抓取各地天气预报页面的 URL，并写入 locations.txt）如下：


    #encoding=utf-8
    import httplib
    import urllib2
    import time
    from threading import Thread
    import threading
    from Queue import Queue
    from time import sleep
    import re
    import copy
    # Running total of forecast-page URLs seen; GetLocationURLs increments it.
    count = 0
    # Language segment substituted into the AccuWeather entry URL.
    lang = "fr"
    class Location:
      """Plain record describing one named place.

      Example calls:
        Location(False, "China", "Beijing", "zh")
        Location(True, "", "Asia", "zh")

      NOTE(review): is_beyond_country presumably marks entries above country
      level (such as a continent), in which case country_name is empty --
      confirm against intended use; the class is not used in the code shown.
      """

      def __init__(self, is_beyond_country, country_name, loc_name, lang):
        # Store the four constructor arguments unchanged.
        self.is_beyond_country = is_beyond_country
        self.country_name, self.loc_name = country_name, loc_name
        self.lang = lang
    # Guards the shared progress counter and keeps progress lines from interleaving.
    prn_lock = threading.RLock()
    def GetLocationURLs(url, recursive):
      """Collect forecast-page URLs reachable from an AccuWeather browse page.

      Args:
        url: Page to inspect. A URL containing "weather-forecast" is treated as
          a leaf: it is counted and returned as-is without being downloaded.
        recursive: When False, return only the matching links found directly on
          the page. When True, follow every link down to the leaves.

      Returns:
        A list of URL strings. A page that cannot be downloaded is reported on
        stdout and contributes an empty list instead of raising.
      """
      global count
      if url.find("weather-forecast") != -1:
        # This function is run by several worker threads, so the shared counter
        # is updated under the lock too, not only printed under it.
        with prn_lock:
          count = count + 1
          if count % 500 == 0:
            print("count:%d" % (count))
        return [url]
      try:
        response = urllib2.urlopen(url)
        try:
          page = response.read()
        finally:
          response.close()
      except (IOError, httplib.HTTPException) as err:
        # Best effort: one unreachable page must not abort the whole crawl.
        with prn_lock:
          print("failed to fetch %s: %s" % (url, err))
        return []
      time.sleep(0.01)
      # Matches entries such as:
      #   <h6><a href="http://www.accuweather.com/zh/browse-locations/afr"><em>Africa</em></a></h6>
      # The groups are non-greedy so that several entries on one line are
      # matched separately instead of being swallowed by a single match.
      pattern = "<h6><a href=\"(.*?)\"><em>(.*?)</em></a></h6>"
      links = [
        href for href, _name in re.findall(pattern, page)
        if href.find("browse-locations") != -1 or href.find("weather-forecast") != -1
      ]
      if not recursive:
        return links
      urls = []
      for link in links:
        urls.extend(GetLocationURLs(link, True))
      return urls
    # Crawl starts at the eur/fr browse page. The site-wide root would be
    # http://www.accuweather.com/zh/browse-locations, with region codes
    # afr, ant, arc, asi, cac, eur, mea, nam, ocn, sam.
    entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
    # A single non-recursive pass yields the first-level links.
    sub_urls = GetLocationURLs(entry_url, False)
    print(len(sub_urls))
    print(sub_urls)
    # State shared with the worker threads.
    ThreadNum = 5
    lock = threading.RLock()
    location_urls = []
    q = Queue()
    # Every first-level link becomes one work item.
    for url in sub_urls:
      q.put(url)
    def working():
      """Worker loop: expand queued browse URLs into forecast-page URLs.

      Repeatedly takes a URL from the shared queue ``q``, crawls it recursively
      with GetLocationURLs and appends the result to the shared list
      ``location_urls`` while holding ``lock``. Never returns.
      """
      while True:
        url = q.get()
        try:
          lst = GetLocationURLs(url, True)
          print("%s %d urls " % (url, len(lst)))
          with lock:
            location_urls.extend(lst)
        except Exception as err:
          # Keep this worker alive so the remaining queue items still get
          # processed; the failed URL simply contributes nothing.
          print("%s failed: %s" % (url, err))
        finally:
          # Always acknowledge the item, otherwise q.join() would never return.
          q.task_done()
    # The workers loop forever, so they are daemon threads: they must not keep
    # the process alive once the queue has been drained.
    for i in range(ThreadNum):
      t = Thread(target=working)
      t.setDaemon(True)
      t.start()
    # Block until every queued URL has been acknowledged with task_done().
    q.join()
    # One URL per line; `with` closes the file even if the write fails.
    with open('locations.txt', "w") as fp:
      fp.write("\n".join(location_urls))
    # NOTE(review): the triple-quoted string below is a bare expression statement,
    # so nothing in it runs. It looks like a leftover file-downloader script and
    # refers to names (NUM, ppt_urls, srt_urls, video_urls) that are not defined
    # in the code shown.
    '''
    def Fetch(url):
      try:
        print url
        web_path = url[0]
        local_name = url[1]   
        print "web_path:", web_path
        print "local_name:", local_name
        sContent = urllib2.urlopen(web_path).read()
        savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
        print savePath
        file = open(savePath,'wb')
        file.write(sContent)
        file.close()
        print savePath + " saved";
      except:
        pass;
    def working():
      while True:
        url = q.get()
        Fetch(url)
        sleep(10)
        q.task_done()
    #root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
    root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
    page = urllib2.urlopen(root_url).read()
    for i in range(NUM):
      t = Thread(target=working)
      t.setDaemon(True)
      t.start()
    urls = copy.deepcopy(ppt_urls)
    urls.extend(srt_urls)
    urls.extend(video_urls)
    print len(ppt_urls)
    print len(srt_urls)
    print len(video_urls)
    print len(urls)
    for url in urls:
      q.put(url)
    q.join()
    '''
    # Also an inert string literal: a one-off download-and-print of a single
    # forecast page.
    '''
    root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
    page = urllib2.urlopen(root_url).read()
    print page
    '''

FetchLocation.py（读取命令行参数指定的 URL 列表文件，例如上面生成的 locations.txt，提取各页面中的地名和天气状况，分别写入 location_name.fr 和 conditions.fr）如下：


    #encoding=utf-8
    import sys
    import httplib
    import urllib2
    import time
    from threading import Thread
    import threading
    from Queue import Queue
    from time import sleep
    import re
    import copy
    from xml.dom import minidom
    import HTMLParser
    import datetime
    # Number of worker threads; main() also uses it to size the URL batches.
    ThreadNumber = 20
    # Work queue; main() puts lists of URLs on it, one list per item.
    q = Queue()
    # Result sets, kept as dicts whose keys are the distinct strings collected.
    locations = {}
    conds = {}
    # Two re-entrant locks are created; the code shown only acquires locks[1].
    locks = [threading.RLock() for i in range(2)]
    def FindCountryBreadCrumbs(page):
      """Extract the <ul id="country-breadcrumbs"> element from an HTML page.

      The page is scanned line by line: the block starts on the line containing
      the opening tag and ends on the first following line containing "</ul>".
      If the opening tag occurs more than once, the last occurrence wins.

      Args:
        page: HTML source as a single string.

      Returns:
        The lines of the breadcrumb list joined with "\\n", or "" when the list
        is missing or never closed.
      """
      lines = page.splitlines()
      start = -1
      end = -1
      opened = False
      for index, line in enumerate(lines):
        if line.find("<ul id=\"country-breadcrumbs\">") != -1:
          start = index
          opened = True
        if opened and line.find("</ul>") != -1:
          end = index
          opened = False
      # `end` stays behind `start` when no closing tag followed the opening one.
      if start == -1 or end < start:
        return ""
      return "\n".join(lines[start:end + 1])
    def GetText(nodelist):
      """Return the concatenated, entity-unescaped data of the text nodes.

      Nodes in nodelist whose nodeType is not TEXT_NODE are ignored.
      """
      return ''.join(
        HTMLParser.HTMLParser().unescape(child.data)
        for child in nodelist
        if child.nodeType == child.TEXT_NODE
      )
    def FindCondition(page):
      """Return the text of every <span class="cond"> element found in page.

      Each match has its HTML entities unescaped and is then encoded as UTF-8.
      """
      pat = "<span class=\"cond\">(.*?)</span>"
      conditions = []
      for raw in re.findall(pat, page):
        conditions.append(HTMLParser.HTMLParser().unescape(raw).encode("utf-8"))
      return conditions
    def _BreadCrumbNames(page):
      """Return the UTF-8 encoded names listed in the page's country breadcrumb."""
      text = FindCountryBreadCrumbs(page)
      text = HTMLParser.HTMLParser().unescape(text)
      dom = minidom.parseString(text.encode("utf-8"))
      names = []
      for li in dom.getElementsByTagName("li"):
        # A breadcrumb entry carries its name in an <a> and/or a <strong>.
        adr_list = li.getElementsByTagName("a")
        if adr_list:
          names.append(GetText(adr_list[0].childNodes).encode("utf-8"))
        strs = li.getElementsByTagName("strong")
        if strs:
          names.append(GetText(strs[0].childNodes).encode("utf-8"))
      return names
    def ExtractInfo(url):
      """Download a page and extract its location names and weather conditions.

      Args:
        url: Address of the page to download.

      Returns:
        A (locs, cds) tuple: locs holds the breadcrumb names, cds the texts
        returned by FindCondition. Both lists are empty when the download
        fails; locs is empty when the breadcrumb cannot be parsed.
      """
      try:
        response = urllib2.urlopen(url)
        try:
          page = response.read()
        finally:
          response.close()
      except Exception:
        # Best effort: skip pages that cannot be fetched. The result keeps the
        # two-list shape so callers can always unpack it.
        return [], []
      try:
        locs = _BreadCrumbNames(page)
      except Exception as err:
        # A page without a well-formed breadcrumb must not abort the caller.
        sys.stderr.write("no breadcrumb parsed for %s: %s\n" % (url, err))
        locs = []
      cds = FindCondition(page)
      return locs, cds
    def AddMap(lst, m):
      """Record every item of lst as a key of the dict m.

      A key that is missing, or whose stored value is None, is set to 1; a key
      holding any other value is left untouched. m is modified in place and
      nothing is returned.
      """
      for x in lst:
        # Identity test: `== None` would consult the stored value's __eq__.
        if m.get(x) is None:
          m[x] = 1
    def working():
      """Worker loop: process batches of URLs taken from the shared queue.

      Each queue item is a list of URLs. Every URL is passed to ExtractInfo and
      the distinct names and conditions of the batch are gathered locally, then
      merged into the shared ``locations`` and ``conds`` dicts while holding
      ``locks[1]``. Never returns.
      """
      while True:
        urls = q.get()
        try:
          m = {}
          m2 = {}
          for url in urls:
            try:
              info = ExtractInfo(url)
            except Exception as err:
              # One bad page must not cost the rest of the batch.
              sys.stderr.write("skipping %s: %s\n" % (url, err))
              continue
            if not info:
              # An empty result means nothing could be extracted for this URL.
              continue
            locs, cds = info
            AddMap(locs, m)
            AddMap(cds, m2)
          # Merge once per batch so the shared dicts are locked only briefly.
          with locks[1]:
            AddMap(m.keys(), locations)
            AddMap(m2.keys(), conds)
        finally:
          # Always acknowledge the batch, otherwise q.join() would never return.
          q.task_done()
    def main():
      """Read a URL list and write the distinct location names and conditions.

      Usage: FetchLocation.py <url-list-file>

      The file named by the first command-line argument is read as one URL per
      line. The URLs are split into batches, each batch is put on ``q`` as one
      list, and ThreadNumber daemon threads running working() consume them.
      When the queue is drained, the keys of ``locations`` are written to
      "location_name.fr" and the keys of ``conds`` to "conditions.fr", one per
      line.

      Exits with status 1 after printing a usage line when the argument is
      missing.
      """
      if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <url-list-file>\n" % (sys.argv[0]))
        sys.exit(1)
      loc_path = sys.argv[1]
      with open(loc_path, "r") as fp:
        # Blank lines would otherwise become empty URLs.
        urls = [line.strip() for line in fp if line.strip()]
      # Floor division keeps the batch size an integer; the +1 keeps it non-zero
      # and results in at most ThreadNumber batches.
      blocks = len(urls) // ThreadNumber + 1
      for start in range(0, len(urls), blocks):
        # Slicing clamps at the end of the list, so the last batch may be short.
        q.put(urls[start:start + blocks])
      for i in range(ThreadNumber):
        t = Thread(target=working)
        t.setDaemon(True)
        t.start()
      q.join()
      with open("location_name.fr", "w") as fp:
        fp.write("\n".join(locations.keys()))
      with open("conditions.fr", "w") as fp:
        fp.write("\n".join(conds.keys()))
    if __name__ == '__main__':
      main()

希望本文所述对大家的python程序设计有所帮助。