title: 8. Crawling Examples with the Scrapy Framework
date: 2019-07-31 14:15:50
tags:
- Crawler
- Scrapy
categories: Python
copyright: false
Crawling a single page, no link following needed
Create the project: scrapy startproject <project name>
scrapy startproject myFirstScrapy
Create the spider: scrapy genspider <spider name> <domain>
scrapy genspider intersting u148.cn
Write items.py to model the fields required
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
# An Item is essentially a dict-like object used to model the elements to be scraped from the page; it maps directly to the requirements analysis (a short usage sketch follows the class)
class MyfirstscrapyItem(scrapy.Item):
# define the fields for your item here like:
title = scrapy.Field()
img_url = scrapy.Field()
info = scrapy.Field()
next_page = scrapy.Field()
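As the comment above says, an Item behaves like a dict but only accepts the fields declared on it. A quick illustrative sketch (not part of the project code), assuming the class defined above:

# Items are used like dicts, but undeclared fields are rejected
item = MyfirstscrapyItem()
item["title"] = "some title"   # declared field: OK
print(dict(item))              # {'title': 'some title'}
# item["author"] = "x"         # would raise KeyError: 'author' is not a declared field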
Write the spider (intersting.py under the spiders directory) and do the parsing. When connecting to a database, create the database and the corresponding table first (a setup sketch is given after the pipeline code below).
# -*- coding: utf-8 -*-
import scrapy
# Import the item
from myFirstScrapy.items import MyfirstscrapyItem
class InterstingSpider(scrapy.Spider):
# This is the basic spider class; it provides the parse callback, the spider's initial URLs, and so on
name = 'intersting'
# name is the spider's name; when the spider is run it is uniquely identified by this name
allowed_domains = ['u148.cn'] # before downloading a page, the downloader checks whether the target URL is under an allowed domain
start_urls = ['http://www.u148.cn/music/'] # starting URLs; once the engine is scheduled it pulls the initial URLs from here
def parse(self, response):
# This method is called back once the downloader has finished downloading the URL
# print(response.text) # the response object holds the content fetched by the downloader
# Parse the response
music_list = response.xpath("//article[starts-with(@class,'ajaxpost box')]")
print(music_list) # Selector xpath="//article[starts-with(@class,'ajaxpost box')]" data='<article class="ajaxpost box triangle wo'>
# With Scrapy's built-in XPath, the result is a list of Selector objects, one for each tag matched by the expression
# extract() pulls the matched content out of a Selector object as strings
# Define a list to hold all of the music entries
musics = []
# Iterate over all of the music entries
for music in music_list:
# Create an item model
item = MyfirstscrapyItem()
item["title"] = music.xpath("./article/h2/a/text()").extract()[0]
# print(item["title"])
item["img_url"] = music.xpath(".//img[@class='thumb']/@data-original
").extract()[0]
item["info"] = music.xpath(".//div[@class='excerpt']/text()").extract()[0]
item["next_page"] = music.xpath("./article/h2/a/@href").extract()[0]
# Store each item in the list as soon as it is parsed
musics.append(item)
# print(item)
return musics
# parse is required to return an iterable object
# The returned object is handed to the Scrapy framework for output (to the log, to a local file, item by item into the pipelines, etc.); a yield-based variant is sketched right after this file
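The list-and-return approach above works, but parse can equally be written as a generator that yields each item as soon as it is built, which is what the later examples in this post do. A minimal sketch of the same loop using yield (same XPath rules as above):

def parse(self, response):
    # Yield each item as it is parsed instead of collecting them into a list
    for music in response.xpath("//article[starts-with(@class,'ajaxpost box')]"):
        item = MyfirstscrapyItem()
        item["title"] = music.xpath("./article/h2/a/text()").extract_first()
        item["img_url"] = music.xpath(".//img[@class='thumb']/@data-original").extract_first()
        item["info"] = music.xpath(".//div[@class='excerpt']/text()").extract_first()
        item["next_page"] = music.xpath("./article/h2/a/@href").extract_first()
        yield item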
Modify settings.py, adjusting the configuration as needed
BOT_NAME = 'myFirstScrapy'
SPIDER_MODULES = ['myFirstScrapy.spiders']
NEWSPIDER_MODULE = 'myFirstScrapy.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'myFirstScrapy (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# The item pipeline component is disabled by default; once it is enabled, the pipeline file is invoked by the engine,
# and the iterable returned by the spider is passed into the pipelines as well
# Each key-value pair describes one pipeline: the key is the component's import path, the value is its priority
# (with several pipelines, the higher-priority one receives the spider's data first); the larger the number, the lower the priority
ITEM_PIPELINES = {
'myFirstScrapy.pipelines.MyfirstscrapyPipeline': 300,
# Enable the custom CSV pipeline; the number is the priority - the smaller it is, the higher the priority and the earlier it runs
'myFirstScrapy.pipelines.CSVPipelines': 310,
}
Process the parsed data as required in pipelines.py (the pipeline file)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import csv
import pymysql
class MyfirstscrapyPipeline(object):
def __init__(self):
pass
# Called when the spider is opened
def open_spider(self,spider):
print("The spider has started")
print(spider)
# Create the database connection
self.conn = pymysql.connect(
host = 'localhost',
port = 3306,
user = 'root',
password = 'root',
db = 'scrapy_test',
charset = 'utf8'
)
# Create a cursor, which sends SQL statements to the MySQL server for execution
self.cur = self.conn.cursor()
# This member method is a callback: when the spider passes data to the pipeline,
# the pipeline iterates over the data and calls this method once per iteration,
# handing over the item currently being processed along with the spider object
def process_item(self, item, spider):
print(item)
print(spider)
# Insert one row per item
# A parameterized query is used so that quotes in the scraped text cannot break the SQL
sql = "INSERT INTO u148 VALUES(0, %s, %s, %s, %s)"
# Send the SQL to the server through the cursor
self.cur.execute(sql, (
item['title'],
item['img_url'],
item['info'],
item['next_page']
))
self.conn.commit()
# The pipeline returns the item by default
# If the item is not returned, pipelines with lower priority cannot use it
return item
def close_spider(self,spider):
print('The spider has been closed')
# Close the cursor before the connection
self.cur.close()
self.conn.close()
# print(spider)
# A pipeline that stores the data as CSV
class CSVPipelines(object):
def open_spider(self,spider):
# newline='' keeps csv.writer from emitting blank lines on Windows
self.mycsv = open("interst.csv",'w',encoding="utf-8",newline='')
self.csvwriter = csv.writer(self.mycsv)
# Write a header row
self.csvwriter.writerow(["title","img_url","info","next_page"])
# A list used to collect all of the item rows
self.items = []
def process_item(self, item, spider):
# Assemble a small list for this row
csvitem = []
csvitem.append(item["title"])
csvitem.append(item["img_url"])
csvitem.append(item["info"])
csvitem.append(item["next_page"])
# Append the row to the big list
self.items.append(csvitem)
return item
def close_spider(self,spider):
self.csvwriter.writerows(self.items)
self.mycsv.close()
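The MySQL pipeline above assumes that the scrapy_test database and a u148 table already exist. The post does not give the schema, so the following one-off setup script is only a sketch of a table that would match the INSERT used above (an auto-increment id plus four text columns; adjust names, types and lengths as needed):

# Run once before starting the spider; the schema below is an assumption
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root', password='root', charset='utf8')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS scrapy_test DEFAULT CHARACTER SET utf8")
cur.execute("""
    CREATE TABLE IF NOT EXISTS scrapy_test.u148 (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        img_url VARCHAR(512),
        info TEXT,
        next_page VARCHAR(512)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()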
Run the project
Command format: scrapy crawl <spider name> [-o <file>.json/xml/csv]
scrapy crawl intersting [-o intersting.csv/xml/json]
A single page type, but with pagination
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class SchoolflowerItem(scrapy.Item):
# Name
name = scrapy.Field()
# Image
img = scrapy.Field()
# Profile
info = scrapy.Field()
# Detail page URL
url = scrapy.Field()
spiders/Beauties.py
# -*- coding: utf-8 -*-
import scrapy
from SchoolFlower.items import SchoolflowerItem
class BeautiesSpider(scrapy.Spider):
name = 'Beauties'
allowed_domains = ['daxues.cn']
start_urls = ['http://news.daxues.cn/xiaohua/ziliao/']
"""
['http://news.daxues.cn/xiaohua/ziliao/','http://news.daxues.cn/xiaohua/ziliao/index_2.html','http://news.daxues.cn/xiaohua/ziliao/index_3.html']
"""
def parse(self, response):
# print(response)
girl_list = response.xpath("//div[@class='xh_list']/dl")
# print(girl_list)
# Iterate
for girl in girl_list:
item = SchoolflowerItem()
item["name"] = girl.xpath("./dt/a/text()").extract()[0]
item["img"] = "http://news.daxues.cn" + girl.xpath("./a/img/@src").extract()[0]
item["info"] = girl.xpath("./dd/text()").extract()[0]
item["url"] = "http://news.daxues.cn" + girl.xpath("./dt/a/@href").extract()[0]
yield item
# Add the next listing page URL and call the downloader manually
# Track the page number on the spider itself; re-initialising it to 1 on every
# call to parse would mean the crawl never gets past page 2
self.page = getattr(self, "page", 1) + 1
if self.page <= 3:
# Rebuild the URL of the next page
url = self.start_urls[0] + "index_" + str(self.page) + ".html"
# Ask Scrapy's downloader to fetch the page
yield scrapy.Request(url=url, callback=self.parse) # the callback is still parse because every listing page is parsed the same way
settings.py
# 1. Enable the item pipeline
# 2. Set the user agent (UA)
# 3. robots.txt rule
# 4. ... (a sketch of these settings follows)
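The post only lists what has to change; a minimal sketch of the relevant parts of settings.py for this project, assuming the generated pipeline class is called SchoolflowerPipeline (the class name is an assumption, the rest mirrors the first example):

BOT_NAME = 'SchoolFlower'
SPIDER_MODULES = ['SchoolFlower.spiders']
NEWSPIDER_MODULE = 'SchoolFlower.spiders'

# Pretend to be a normal browser
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36'

# Ignore robots.txt for this exercise
ROBOTSTXT_OBEY = False

# Enable the item pipeline (class name assumed)
ITEM_PIPELINES = {
    'SchoolFlower.pipelines.SchoolflowerPipeline': 300,
}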
Run result:
Crawling a first-level page plus its second-level pages
First-level page:
Second-level page:
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class MoviespiderItem(scrapy.Item):
# First-level page
# title
title = scrapy.Field()
# Synopsis
info = scrapy.Field()
# Date
date = scrapy.Field()
# Second-level page
# Poster
img = scrapy.Field()
# Plot
story = scrapy.Field()
# Download link
downloader = scrapy.Field()
spiders/movies.py
# -*- coding: utf-8 -*-
import scrapy
from MovieSpider.items import MoviespiderItem
class MovieSpider(scrapy.Spider):
name = 'movie'
allowed_domains = ['dytt8.net']
start_urls = ['http://www.dytt8.net/html/gndy/dyzz/index.html']
def parse(self, response):
# Extract all of the movie entries from the response body
movie_list = response.xpath("//div[@class='co_content8']//table")
print(movie_list)
# Iterate over the movies and extract the details
for movie in movie_list:
# Create a model
item = MoviespiderItem()
# Fill the item with content from the first-level page
item["title"] = movie.xpath(".//a/text()").extract_first()
item['date'] = movie.xpath(".//font/text()").extract_first()
item['info'] = movie.xpath(".//tr[last()]/td/text()").extract_first()
# Get the content of the second-level page
next_url = "http://www.dytt8.net" + movie.xpath(".//a/@href").extract_first()
# Information still needs to be extracted from the second-level page, so the downloader is called again
yield scrapy.Request(url=next_url,callback=self.parse_next,meta={"movie_item":item})
# scrapy.Request has a parameter called meta; whatever is put in meta is carried over as an attribute of the resulting response object
# A member method used to parse the second-level page
def parse_next(self,response):
# item = MoviespiderItem()
# Retrieve the partially filled item from the previous page
item = response.meta["movie_item"]
# Extract the remaining information on the second-level page and store it in the item
# Extract the poster link
item['img'] = response.xpath("//div[@id='Zoom']//img[1]/@src").extract_first()
# Extract the plot
item["story"] = response.xpath("//div[@id='Zoom']").xpath("string(.)").extract_first()
# Download link
item["downloader"] = response.xpath("//td[@bgcolor='#fdfddf']/a/@href").extract_first()
yield item
settings.py is the same as above
Run result:
Crawling page data loaded dynamically with JS
The raw HTML under each product:
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class VipspdierItem(scrapy.Item):
# Product description
info = scrapy.Field()
# Original price
orin_price = scrapy.Field()
# Current price
price = scrapy.Field()
# Discount
discount = scrapy.Field()
# Second-level page URL
next_url = scrapy.Field()
spiders/vip.py
# -*- coding: utf-8 -*-
import scrapy
from VipSpdier.items import VipspdierItem
class VipSpider(scrapy.Spider):
name = 'vip'
allowed_domains = ['vip.com']
start_urls = ['http://category.vip.com/suggest.php?keyword=%E8%BF%9E%E8%A1%A3%E8%A3%99']
def parse(self, response):
goods_list = response.xpath("//div[starts-with(@id,'J_pro_')]")
print(goods_list)
for goods in goods_list:
item = VipspdierItem()
item["info"] = goods.xpath(".//a[@rel='noopener']/text()").extract_first()
# The other fields are left as an exercise
# Detail page URL
next_url = "http:" +goods.xpath(".//a[@rel='noopener']/@href").extract_first()
yield scrapy.Request(url=next_url,callback=self.parse_next)
def parse_next(self,response):
print(response)
pass
settings.py additionally enables a downloader middleware
# Downloader middleware components; if none is enabled, the framework's built-in download components are used
DOWNLOADER_MIDDLEWARES = {
'VipSpdier.middlewares.VipspdierDownloaderMiddleware': 543,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware':None # this middleware is built into the framework; setting it to None here disables it so that its default user-agent handling is not applied
}
middlewares.py
In the process_request method of the VipspdierDownloaderMiddleware class, add a browser to execute the JS (note: JS has to be interpreted by a browser engine, so Selenium is used to drive a browser rather than relying on the framework's own downloader).
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
from selenium import webdriver
from time import sleep
from scrapy.http import HtmlResponse
class VipspdierSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class VipspdierDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
# The downloader cannot execute JS, so a Selenium driver is plugged in here and a browser makes the request instead (an alternative using headless Chrome is sketched after this file)
driver = webdriver.PhantomJS()
driver.get(request.url)
sleep(3)
# Grab the page source rendered in the browser
body = driver.page_source
print("=========================")
print("Currently visiting:", request.url)
# Wrap the rendered page source in a response object and return it
return HtmlResponse(driver.current_url,body=body,encoding='utf-8',request=request)
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
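webdriver.PhantomJS worked when this post was written, but PhantomJS is no longer maintained and recent Selenium releases have removed support for it. A sketch of the same process_request idea using headless Chrome instead (assumes Chrome plus a matching chromedriver are installed; the class name is made up for illustration):

from selenium import webdriver
from scrapy.http import HtmlResponse

class HeadlessChromeDownloaderMiddleware(object):
    # Same role as VipspdierDownloaderMiddleware above, but driving headless Chrome
    def process_request(self, request, spider):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")   # no visible browser window
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(request.url)          # the browser executes the page's JS
            body = driver.page_source        # rendered HTML
            return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
        finally:
            driver.quit()                    # always release the browser process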
Run result:
Crawling pages that require a POST request (example: Baidu Translate)
spiders/baidu.py
# -*- coding: utf-8 -*-
import scrapy
class BaiduSpider(scrapy.Spider):
name = 'baidu'
allowed_domains = ['baidu.com']
# start_urls = ['http://baidu.com/']
#
# def parse(self, response):
# pass
# For a POST request, override how the initial requests are scheduled
def start_requests(self):
# This method is called before any request has been issued
post_url = "http://fanyi.baidu.com/sug"
# Request body
data = {
"kw":"hello"
}
# Issue the POST request
yield scrapy.FormRequest(url=post_url,formdata=data,callback=self.parse_post)
def parse_post(self,response):
print(response.text)
pass
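parse_post above only prints the raw body. At the time, the sug endpoint answered with JSON roughly shaped like {"errno": 0, "data": [{"k": "...", "v": "..."}, ...]}; assuming that shape still holds, the callback could unpack it like this (a sketch, not the original post's code):

import json

def parse_post(self, response):
    # Assumes a JSON body of the form {"errno": 0, "data": [{"k": ..., "v": ...}, ...]}
    result = json.loads(response.text)
    for entry in result.get("data", []):
        print(entry["k"], "->", entry["v"])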
Incremental crawling (pagination with an unknown upper bound)
First-level page:
Pagination at the bottom of the first-level page: the page count is not known in advance
Second-level page:
Command to create the spider file under the spiders directory: scrapy genspider -t crawl <spider name> <domain>
scrapy genspider -t crawl dushu dushu.com
items.py
import scrapy
class DushuspiderItem(scrapy.Item):
# First-level page
# Book title
name = scrapy.Field()
# Author
author = scrapy.Field()
# Cover
cover_img = scrapy.Field()
# Second-level page
# Publisher
cbs = scrapy.Field()
# Content summary
content = scrapy.Field()
# About the author
author_info = scrapy.Field()
# List price
price = scrapy.Field()
# Table of contents
mulu = scrapy.Field()
dushu.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# CrawlSpider is a more advanced spider template than the basic one; through a set of rules it can feed the downloader large batches of links, and those links are followed automatically
from DushuSpider.items import DushuspiderItem
class DushuSpider(CrawlSpider):
name = 'dushu'
allowed_domains = ['dushu.com']
start_urls = ['http://www.dushu.com/book/1163.html']
rules = (
Rule(LinkExtractor(allow=r'/book/1163_\d+\.html'), callback='parse_item', follow=True),
)
# rules expands the crawl; it is a tuple that can hold several Rule objects
# The first argument of a Rule is a LinkExtractor carrying a URL-matching rule, the second is the callback. Once the Rule has matched URLs, they are all handed to the scheduler, the scheduler drives the downloader, and when a download finishes the callback is invoked. [Note] the callback is given as a string
# [Note] the Rule automatically filters out invalid URLs
# LinkExtractor arguments (a restrict_xpaths variant is sketched right after this spider):
# allow - match URLs with a regular expression (most common)
# restrict_xpaths - restrict link extraction with XPath expressions
# restrict_css - restrict link extraction with CSS selectors
def parse_item(self, response):
book_list = response.xpath("//div[@class='bookslist']/ul/li")
for book in book_list:
item = DushuspiderItem()
item["name"] = book.xpath(".//h3/a/text()").extract_first()
item["author"] = book.xpath(".//p/a/text()").extract_first()
item["cover_img"] = book.xpath(".//img/@data-original").extract_first()
# Follow the link to the second-level page
next = "http://www.dushu.com" + book.xpath(".//h3/a/@href").extract_first()
yield scrapy.Request(url=next,callback=self.parse_next,meta={"book":item})
def parse_next(self, response):
item = response.meta["book"]
item['cbs'] = response.xpath("//div[@class='book-details-left']/table//tr[2]").extract_first()
item['content'] = response.xpath("//div[@class='text txtsummary']").extract()[0]
item['author_info'] = response.xpath("//div[@class='text txtsummary']").extract()[1]
item['price'] = response.xpath("//p[@class='price']/span/text()").extract_first()
m = response.xpath("//div[starts-with(@class,'text txtsummary')]/text()")
if len(m) == 3:
# In this case the book has a table of contents
item["mulu"] = m.extract()[2]
else:
item["mulu"] = ''
yield item
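As mentioned in the comments above, LinkExtractor does not have to use a regular expression; it can also be restricted to links found inside a particular part of the page. A sketch of an equivalent rule using restrict_xpaths (the XPath for the pager block is an assumption about the site's layout):

# Only follow links that appear inside the (assumed) pagination block
rules = (
    Rule(LinkExtractor(restrict_xpaths="//div[@class='pages']"),
         callback='parse_item', follow=True),
)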
settings.py
Enable the pipeline, set the robots.txt rule, the UA, and so on
pipelines.py stores the data in the database (the database and the corresponding table must be created first; a setup sketch follows the pipeline code)
import pymysql
class DushuspiderPipeline(object):
def open_spider(self,spider):
self.conn = pymysql.connect(
host='127.0.0.1',
port=3306,
user = 'root',
password='root',
db='dushudb',
charset='utf8'
)
self.cur = self.conn.cursor()
def process_item(self, item, spider):
# SQL statement; a parameterized query is used because the scraped HTML fragments are likely to contain quotes
sql = "INSERT INTO books VALUES(NULL, %s, %s, %s, %s, %s, %s, %s, %s)"
self.cur.execute(sql, (item['name'], item['author'], item['cover_img'], item['cbs'], item['content'], item['author_info'], item['price'], item['mulu']))
self.conn.commit()
return item
def close_spider(self,spider):
self.cur.close()
self.conn.close()
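As with the first example, the dushudb database and the books table have to exist before the spider runs. The schema is not given in the post; a sketch that matches the nine-column INSERT above (an auto-increment id plus eight text columns, sizes are assumptions):

# Run once before starting the spider; the schema below is an assumption
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root', charset='utf8')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS dushudb DEFAULT CHARACTER SET utf8")
cur.execute("""
    CREATE TABLE IF NOT EXISTS dushudb.books (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255),
        author VARCHAR(255),
        cover_img VARCHAR(512),
        cbs TEXT,
        content TEXT,
        author_info TEXT,
        price VARCHAR(64),
        mulu TEXT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()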
Run result: