Getting started: usage examples
# Show help information
scrapy --help
# Show the version of Scrapy and its components
scrapy version -v
# Create a project
scrapy startproject <project_name>
# Create a spider (you can create more than one, but the names must be unique)
scrapy genspider <name> <domain_to_crawl>
scrapy genspider aaa aaa.com
scrapy genspider bbb bbb.com
# List all spiders in the project
scrapy list
# Open the target URL in a browser to see the page as Scrapy fetches it
scrapy view http://www.baidu.com
# Parse a given URL with the project's spider, usually for testing
scrapy parse http://www.baidu.com
# shell can be run outside of a project
scrapy shell
# runspider runs a single spider file on its own
scrapy runspider aaaaa.py
# bench runs a quick benchmark
scrapy bench
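The shell is handy for trying selectors interactively before writing them into a spider; a typical session might look like the following sketch (the URL and expressions are placeholders, not part of the original examples):

scrapy shell http://www.baidu.com
# Inside the shell a response object for the fetched page is already defined:
>>> response.xpath('//title/text()').extract()
>>> response.css('title::text').extract()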
Introduction to the Spider class
# Attributes
name: the spider's name; it must be unique
allowed_domains: the domains the spider is allowed to crawl
start_urls: the initial URLs
custom_settings: per-spider settings that override the global settings
crawler: the crawler this spider is bound to
settings: the settings instance, containing all of the project's configuration values
logger: the logger instance
# Methods
from_crawler(crawler, *args, **kwargs): class method used to create spiders
start_requests(): generates the initial requests
make_requests_from_url(url): builds a request from a URL
parse(response): parses the page content
log(message[, level, component]): writes a log entry; you can also use the logger attribute directly, e.g. self.logger.info("visited success")
closed(reason): called when the spider is closed
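A minimal sketch of a spider that exercises most of these members (the name, domain, URLs, and setting below are illustrative, not from the original tutorial):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"                          # must be unique within the project
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    custom_settings = {"DOWNLOAD_DELAY": 1}   # overrides the global settings for this spider only

    def start_requests(self):
        # generate the initial requests explicitly instead of relying on start_urls alone
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # parse the page content; self.logger writes to the spider's log
        self.logger.info("visited success")
        for href in response.xpath("//a/@href").extract():
            yield {"link": href}

    def closed(self, reason):
        self.logger.info("spider closed: %s" % reason)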
Built-in Spider subclasses
CrawlSpider
  # The most commonly used spider, for crawling ordinary web pages
  # It adds two members:
  ## rules: defines the crawling rules, i.e. how links are followed and which parse callback handles each link (see the sketch after this list)
  ## parse_start_url(response): parses the responses of the initial URLs
XMLFeedSpider
CSVFeedSpider
SitemapSpider
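For reference, a CrawlSpider with a single rule could be sketched as follows; the allow pattern, names, and URLs are illustrative assumptions:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ExampleCrawlSpider(CrawlSpider):
    name = "example_crawl"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]

    # rules: which links to follow and which callback parses them
    rules = (
        Rule(LinkExtractor(allow=r"/category/"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # parse each page matched by the rule
        yield {
            "url": response.url,
            "title": response.xpath("//title/text()").extract_first(),
        }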
Related components
Selector: there are many libraries for parsing web pages, such as BeautifulSoup and lxml, but Scrapy uses its own Selector by default, and it is comparatively convenient.

# Usage
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

# Instantiate from text
body = '<html><body><span>good</span></body></html>'
Selector(text=body).xpath('//span/text()').extract()

# Instantiate from a response
response = HtmlResponse(url='http://example.com', body=body)
Selector(response=response).xpath('//span/text()').extract()
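Inside a spider callback you usually do not need to build a Selector yourself, because the response passed to the callback exposes the same API; a hypothetical callback as a sketch:

def parse(self, response):
    # response.xpath() is a shortcut for Selector(response=response).xpath()
    for text in response.xpath('//span/text()').extract():
        self.logger.info(text)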
Items

# Create the project
scrapy startproject tutorial

# Create the spider
scrapy genspider pm25 pm25.in

# Write the Items
import scrapy

class Pm25CityItem(scrapy.Item):
    city_name = scrapy.Field()    # name of the city
    home_link = scrapy.Field()    # link to the city's data page
    city_pinyin = scrapy.Field()  # pinyin of the city name

# Flesh out the Spider
import scrapy
from tutorial.items import Pm25CityItem

class Pm25Spider(scrapy.Spider):
    name = "pm25"
    allowed_domains = ["pm25.in"]
    start_urls = [
        'http://www.pm25.in',
    ]

    def parse(self, response):
        sel = scrapy.Selector(response)
        citys = sel.xpath("//div[@class='all']/div[@class='bottom']/ul[@class='unstyled']/div[2]/li")
        city_items = []
        for city in citys:
            city_item = Pm25CityItem()
            href = ''.join(city.xpath('a/@href').extract()).strip()
            city_item['city_name'] = ''.join(city.xpath('a/text()').extract()).strip().encode("UTF-8")
            city_item['home_link'] = 'http://www.pm25.in' + href
            city_item['city_pinyin'] = href.split('/')[1]
            city_items.append(city_item)
        return city_items
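Before wiring up database storage you can verify the spider's output with a feed export; the output filename below is arbitrary:

# Run the spider and write the scraped items to a JSON file
scrapy crawl pm25 -o cities.json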
Configuring the settings.py file
# Configure the MySQL data source
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'test'      # database name
MYSQL_USER = 'root'        # database user
MYSQL_PASSWD = '123456'    # database password
MYSQL_PORT = 3306          # database port

# Enable the MySQL storage pipeline
ITEM_PIPELINES = {
    'tutorial.pipelines.MySQLStoreDataPipeline': 300,  # save items to the database
}

# Storing the data (tutorial/pipelines.py)
from scrapy import log
from twisted.enterprise import adbapi
import datetime, uuid
import MySQLdb
import MySQLdb.cursors

class MySQLStoreDataPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.save_city, item)
        query.addErrback(self.handle_error)
        return item

    # Insert the city data into tbl_all_city
    def save_city(self, conn, item):
        conn.execute("""
            select 1 from tbl_all_city where city_pinyin = %s
        """, (item['city_pinyin'],))
        ret0 = conn.fetchone()
        if not ret0:
            ret1 = conn.execute("""
                insert into tbl_all_city(city_pinyin, city_name, home_link)
                values(%s, %s, %s)
            """, (item['city_pinyin'], item['city_name'], item['home_link'],))
            log.msg('save to tbl_all_city: %s' % ret1, level=log.INFO)

    # Error handling
    def handle_error(self, e):
        log.err(e)

# Run the crawler
scrapy crawl pm25
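The pipeline assumes a tbl_all_city table already exists; a schema along these lines would match the select and insert statements above (the column types and lengths are assumptions, not from the original tutorial):

-- Assumed schema for the target table
CREATE TABLE tbl_all_city (
    id          INT AUTO_INCREMENT PRIMARY KEY,
    city_pinyin VARCHAR(64)  NOT NULL,
    city_name   VARCHAR(64)  NOT NULL,
    home_link   VARCHAR(255) NOT NULL
) DEFAULT CHARSET=utf8;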