复制代码 代码如下:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from cnbeta.items import CnbetaItem
class CBSpider(CrawlSpider):
name = 'cnbeta'
allowed_domains = ['cnbeta.com']
start_urls = ['http://www.jb51.net']
rules = (
Rule(SgmlLinkExtractor(allow=('/articles/.*\\.htm', )),
callback='parse_page', follow=True),
)
def parse_page(self, response):
item = CnbetaItem()
sel = Selector(response)
item['title'] = sel.xpath('//title/text()').extract()
item['url'] = response.url
return item
Copyright© 2013-2020
All Rights Reserved 京ICP备2023019179号-8