A web crawler is a program that simulates a browser surfing the web in order to fetch data from the internet.
Risks
Mitigating the risks
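The two headings above only name the topics. One common mitigation that also appears later in the Scrapy settings (ROBOTSTXT_OBEY) is respecting a site's robots protocol. A minimal sketch using the standard library's urllib.robotparser; the target URL is a hypothetical placeholder, matching the ones used elsewhere in these notes:

# Minimal sketch: check robots.txt before crawling (hypothetical target site)
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://www.xxx.com/robots.txt')  # hypothetical placeholder URL
rp.read()
# Only crawl a path if the robots protocol allows it for our User-Agent
print(rp.can_fetch('Mozilla/5.0', 'https://www.xxx.com/page/1'))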
requests is a module for sending network requests.
pip install requests
# Imports
import re
import requests
import xmltodict
import json

# Target URL
url = 'https://www.xxx.com/'
# UA spoofing
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}
# Query-string parameters
param = {
    'page': 1
}
# Form-style parameters (form data is normally sent with requests.post via data=...)
data = {
    'username': 'admin',
    'password': '123456'
}
# Send the request
response = requests.get(url, headers=headers, params=param, data=data)
# Fix the encoding of the response data
response.encoding = 'utf-8'
# Get the response data; the text attribute returns the body as a string
page_text = response.text
# Deserialize a JSON response into Python objects
movie_list = response.json()
# The content attribute returns the body as bytes
pic_data = requests.get(url=url, headers=headers).content
# Response headers; if a file was requested this includes its type
response.headers["Content-Type"]

# Data parsing with regular expressions
ex = ''  # the regular expression goes here
# When using findall in a crawler, pass re.S so '.' also matches newlines
img_src_list = re.findall(ex, page_text, re.S)
print(page_text)

# Convert XML to JSON
# js_dic = json.dumps(xmltodict.parse(text))
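Since ex is left empty above, here is a minimal sketch of what the regex-parsing step might look like. The HTML snippet, class name, and pattern are illustrative assumptions, not taken from a real page:

# Minimal regex-parsing sketch (hypothetical HTML structure)
import re

page_text = '<div class="thumb"><img src="/img/1.jpg" alt="a"></div>'
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
# re.S lets '.' match newlines, so the pattern also works across multi-line HTML
img_src_list = re.findall(ex, page_text, re.S)
print(img_src_list)  # ['/img/1.jpg']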
Installation
pip install bs4
pip install lxml
Usage
import requests
from bs4 import BeautifulSoup

main_url = 'http://www.baidu.com'
page_text = requests.get(main_url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')

# Tag-name access: returns only the first p tag that appears
soup.p
# Attribute-based lookup: the first div whose class is c1.
# Only class must be written as class_; other attributes keep their names.
soup.find('div', class_='c1')
# All divs whose class is c1: returns a list
data = soup.find_all('div', class_='c1')

# CSS-selector lookup
soup.select('#c1')
# > means one level of nesting
soup.select('.i1 > ul > li')
# A space means any number of levels
soup.select('.i1 li')

# Extracting text (select returns a list, so index into it first)
data = soup.select('#c1')[0]
data.string  # only the text that is a direct child of the tag
data.text    # all text inside the tag, including descendants

# Extracting an attribute
data = soup.select('#c1')[0]
data["href"]
Installation
pip install lxml
Usage
An XPath expression always returns a list.
import requests
from lxml import etree

url = 'https://www.xx.com'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)

# meta tags directly under html > head
tree.xpath('/html/head/meta')
# meta tags anywhere below the html tag
tree.xpath('/html//meta')
# meta tags anywhere in the document
tree.xpath('//meta')
# divs whose class is c1
tree.xpath('//div[@class="c1"]')
# The first div; XPath indexes start at 1
tree.xpath('//div[1]')
# Text of the div whose id is i1; /text() returns only the tag's direct text
tree.xpath('//div[@id="i1"]/text()')[0]
# //text() returns all text inside the tag, including descendants
tree.xpath('//div[2]//text()')
# The href attribute of the a tag whose id is i1
tree.xpath('//a[@id="i1"]/@href')

# Fixing mojibake for an individual value
data = tree.xpath('//div[2]//text()')[0]
data.encode('iso-8859-1').decode('gbk')

# The pipe operator makes one XPath expression cover several cases
tree.xpath('//div[2]//text() | //div[3]//text()')
If a crawler sends a high frequency of requests to a server within a short period, the requesting IP may be banned by the target server. Forwarding requests through a proxy changes the IP that the target server sees.
url = 'https://www.xxx.com/'
# proxies: specifies the proxy server
page_text = requests.get(url=url, headers=headers, proxies={'https': 'proxy_ip:port'}).text
A proxy pool is really just a list of dicts. To use one, build the pool (usually by scraping and parsing a proxy site) in the following format:
[
    {'https': 'ip:port'},
    {'http': 'ip:port'}
]
Using a proxy pool
import random
import requests
from lxml import etree

url = 'https://www.xx.com/nn/'
# Proxy pool
all_ips = [
    {'https': 'ip:port'},
    {'http': 'ip:port'}
]
for page in range(30):
    # Pick a random proxy from the pool for every request
    page_text = requests.get(url, headers=headers, proxies=random.choice(all_ips)).text
import requests

# A Session object can send get and post requests just like the requests module itself
session = requests.Session()
# First request: capture the cookie and store it on the session object
first_url = 'https://xxx.com/'
session.get(first_url, headers=headers)
# Second request: sent with the cookie attached automatically
url = 'https://xxx2.com/'
json_data = session.get(url=url, headers=headers).json()
Handle verification codes with a captcha-recognition platform
Chaojiying (超级鹰)
Yundama (云打码)
Chaojiying workflow:
1. Create a software entry to obtain a software ID
2. Download the sample code
Approach: first save the page's captcha image locally, then send it to the captcha platform, which returns the recognized code as a string.
# Code provided by Chaojiying
import requests
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                          data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of a wrongly recognized captcha
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                          data=params, headers=self.headers)
        return r.json()
# Helper that recognizes a captcha image
def transformCode(imgPath, imgType):
    # imgPath: path of the image file
    # imgType: captcha type code
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    im = open(imgPath, 'rb').read()
    return chaojiying.PostPic(im, imgType)['pic_str']
# Usage example
url = 'https://xxx/login'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
# Parse out the address of the captcha image
code_img_src = tree.xpath('//*[@id="imgCode"]/@src')[0]
img_data = requests.get(code_img_src, headers=headers).content
# Save the image locally
with open('./code.jpg', 'wb') as fp:
    fp.write(img_data)
# Call the helper to get the recognized captcha string
code = transformCode('./code.jpg', 1902)
Use a thread pool to improve crawl efficiency
import requests
import time
from multiprocessing.dummy import Pool

pool = Pool(3)
# Target URLs
urls = [
    'http://127.0.0.1:5000/index',
    'http://127.0.0.1:5000/index',
    'http://127.0.0.1:5000/index'
]

# Sends the (time-consuming) network request
def req(url):
    return requests.get(url).text

# pool.map(req, urls) applies the callback req to every element of urls; the calls run asynchronously
page_text_list = pool.map(req, urls)
print(page_text_list)
Multi-task asynchronous coroutines
The requests module itself does not support asynchronous I/O; use the aiohttp module instead.
import asyncio
import aiohttp
import time
from bs4 import BeautifulSoup

# Collect all URLs to be requested into one list
urls = ['http://127.0.0.1:5000/index', 'http://127.0.0.1:5000/index', 'http://127.0.0.1:5000/index']
start = time.time()

async def get_request(url):
    # Add async before every with and await before every blocking call
    async with aiohttp.ClientSession() as s:
        # s.get(url, headers=headers, params=params, proxy="http://ip:port")
        # with releases the resources automatically
        async with await s.get(url) as response:
            # response.read() returns binary data
            page_text = await response.text()  # text data
            return page_text

def parse(task):
    page_text = task.result()
    soup = BeautifulSoup(page_text, 'lxml')
    data = soup.find('div', class_="tang").text
    print(data)

tasks = []  # holds all task objects: multi-task!
for url in urls:
    c = get_request(url)
    # Create a task object
    task = asyncio.ensure_future(c)
    # Bind a callback to the task object
    task.add_done_callback(parse)
    tasks.append(task)

# Create an event loop
loop = asyncio.get_event_loop()
# Register the tasks with the loop and start it
# asyncio.wait(tasks) gives every task permission to be suspended
loop.run_until_complete(asyncio.wait(tasks))
Use the PyExecJS library to simulate executing JavaScript code.
Installation
pip install PyExecJS
Besides PyExecJS, a Node.js runtime must also be installed.
Usage
import execjs

node = execjs.get()
# Read the js file
file = 'test.js'
ctx = node.compile(open(file, encoding='utf-8').read())
# Execute the js
js = 'getPostParamCode("{0}")'.format("")
params = ctx.eval(js)
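The test.js file and the getPostParamCode function above come from whatever site is being reverse-engineered and are not shown in these notes. A self-contained sketch with a stand-in JS function, so the compile/call flow can be run end to end (assumes PyExecJS and Node.js are installed):

# Self-contained sketch: compile a tiny hypothetical JS function and call it from Python
import execjs

js_source = """
function getPostParamCode(name) {
    return "code_" + name;   // hypothetical stand-in for the site's real JS
}
"""
node = execjs.get()
ctx = node.compile(js_source)
print(ctx.call('getPostParamCode', 'admin'))   # -> code_admin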
Selenium started out as an automated testing tool. Crawlers use it mainly because requests cannot execute JavaScript. Selenium drives a real browser and fully simulates its behaviour (navigation, typing, clicking, scrolling, and so on), so it returns the page as it looks after rendering, and it supports multiple browsers.
Install the module
pip install selenium
Download the browser driver
from selenium import webdriver

# Instantiate a browser object from the browser driver
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
url = 'https://www.xx.com/'
# Open the page
bro.get(url)
# Locate a tag (for example an input)
search = bro.find_element_by_id('key')                    # by id
search = bro.find_element_by_xpath('//input[@id="key"]')  # by XPath expression
# Type text into the located tag
search.send_keys('xxx')
# Trigger a click event (buttons etc.)
search.click()
# Grab the source of the current page
bro.page_source
# Handle a JavaScript alert: accept() confirms, dismiss() cancels
bro.switch_to.alert.accept()
# Close the browser
bro.quit()
from selenium import webdriver

url = 'https://www.xx.com/'
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get(url)
# JS injection: scroll to the bottom of the page
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
Evading detection
from selenium import webdriver
from selenium.webdriver import ChromeOptions

option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=option)
url = 'https://www.xxx.com/'
bro.get(url)
Headless mode: no browser window is opened
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
bro = webdriver.Chrome(executable_path='./chromedriver.exe', chrome_options=chrome_options)
url = 'https://www.xxx.com/'
bro.get(url)
from selenium import webdriver
from selenium.webdriver import ActionChains  # action chains
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
url = 'https://www.xx.com/'
bro.get(url)
# If the target tag lives inside an iframe, locating it directly raises NoSuchElementException.
# Solution: use switch_to.frame to switch into that sub-page first.
bro.switch_to.frame('iframeResult')
# Locate the tag
div_tag = bro.find_element_by_xpath('//*[@id="draggable"]')
# Instantiate an ActionChains object
action = ActionChains(bro)
action.click_and_hold(div_tag)  # click and hold
# perform() executes the action chain immediately
for i in range(5):
    action.move_by_offset(xoffset=15, yoffset=15).perform()
action.release()
bro.quit()
Suitable for captchas of the type "click every xx in the picture".
from selenium import webdriver
from selenium.webdriver import ActionChains
from time import sleep
from PIL import Image            # install PIL or Pillow
from CJY import Chaojiying_Client

# Helper that recognizes a captcha image
def transformCode(imgPath, imgType):
    chaojiying = Chaojiying_Client('xx', 'xx', '899370')
    im = open(imgPath, 'rb').read()
    return chaojiying.PostPic(im, imgType)['pic_str']

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.xxx.cn/')

# Save a screenshot of the current browser page
bro.save_screenshot('./main.png')

# Crop out the captcha area
# Capture the position of the captcha tag on the page
img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = img_tag.location   # top-left coordinates of the tag
size = img_tag.size           # size of the tag
# Rectangle that bounds the crop area
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']), int(location['y'] + size['height']))
# Crop the specified area with the Image tool
i = Image.open('./main.png')
frame = i.crop(rangle)        # crop cuts out the given rectangle
frame.save('code.png')

# Send the captcha to the platform for recognition
result = transformCode('./code.png', 9004)
print(result)   # format: x1,y1|x2,y2|x3,y3

all_list = []   # [[x1,y1],[x2,y2],[x3,y3]]
if '|' in result:
    list_1 = result.split('|')
    count_1 = len(list_1)
    for i in range(count_1):
        xy_list = []
        x = int(list_1[i].split(',')[0])
        y = int(list_1[i].split(',')[1])
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
else:
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    xy_list = []
    xy_list.append(x)
    xy_list.append(y)
    all_list.append(xy_list)

for point in all_list:
    x = point[0]
    y = point[1]
    # Click the given coordinate (offset relative to the captcha image)
    ActionChains(bro).move_to_element_with_offset(img_tag, x, y).click().perform()

bro.find_element_by_id('username').send_keys('xxxxxx')
bro.find_element_by_id('password').send_keys('xxxx')
bro.find_element_by_id('loginSub').click()
print(bro.page_source)
bro.quit()
Scrapy is an asynchronous processing framework built on Twisted. It is a crawler framework written in pure Python: by customizing just a few modules, you can easily build a crawler to scrape web content or images.
Linux:
pip3 install scrapy
Windows:
1. pip3 install wheel
2. Download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
3. Enter the download directory and run: pip3 install Twisted-17.1.0-cp35-cp35m-win_amd64.whl
4. pip3 install pywin32
5. pip3 install scrapy
Commands
Create a project
scrapy startproject proName
Enter the project directory
cd proName
Create a spider file
scrapy genspider spiderName www.xxx.com
Run the project
scrapy crawl spiderName
settings
# UA spoofing
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
# Whether to obey the robots.txt protocol
ROBOTSTXT_OBEY = False
# Log level
LOG_LEVEL = 'ERROR'
# Log file
# LOG_FILE = './log.txt'
# Cookie handling
# COOKIES_ENABLED = True
Spider file
import scrapy

class FirstSpider(scrapy.Spider):
    # Spider name: the unique identifier of this spider file
    name = 'xx'
    # Allowed domains
    # allowed_domains = ['www.xxx.com']
    # Every URL in this list gets requested automatically
    start_urls = ['http://www.xx.com/']

    # Data parsing
    def parse(self, response):
        article_list = response.xpath('/html/body/section/div/div/main/article')
        for article in article_list:
            # response.xpath returns Selector objects, not strings
            # extract_first() takes only the first element of the list and returns a string
            title = article.xpath('./div[1]/h1/a/text()').extract_first()
            # extract() applies to every element of the list and still returns a list
            content = article.xpath('./div[2]//text()').extract()
            print(title, content)
This mode can only store the return value of the parse method, and only into text files with specific extensions.
scrapy crawl spiderName -o ./res.csv
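For this export mode, parse has to return the scraped data instead of yielding items. A minimal sketch that reuses the article XPath from the spider above:

import scrapy

class FirstSpider(scrapy.Spider):
    name = 'xx'
    start_urls = ['http://www.xx.com/']

    def parse(self, response):
        # Collect the parsed records and return them so `scrapy crawl xx -o ./res.csv` can write them out
        all_data = []
        article_list = response.xpath('/html/body/section/div/div/main/article')
        for article in article_list:
            title = article.xpath('./div[1]/h1/a/text()').extract_first()
            content = ''.join(article.xpath('./div[2]//text()').extract())
            all_data.append({'title': title, 'content': content})
        return all_data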
# Spider file
import scrapy
from DuanziPro.items import DuanziproItem   # item class used by the pipeline mechanism

class DuanziSpider(scrapy.Spider):
    name = 'duanzi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xx.com/']

    def parse(self, response):
        article_list = response.xpath('/html/body/section/div/div/main/article')
        for article in article_list:
            title = article.xpath('./div[1]/h1/a/text()').extract_first()
            content = article.xpath('./div[2]//text()').extract()
            content = ''.join(content)
            # Instantiate an item object and store one parsed record in it
            item = DuanziproItem()
            item['title'] = title
            item['content'] = content
            yield item   # submit the item to the pipeline
# DuanziPro/items.py
import scrapy

class DuanziproItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
# DuanziPro/pipelines.py
import pymysql
from redis import Redis

# Write the data to a text file
class DuanziproPipeline(object):
    fp = None

    def open_spider(self, spider):
        """Called once when the spider starts"""
        self.fp = open('./duanzi.txt', 'w', encoding='utf-8')

    # Called once for every item the pipeline receives
    def process_item(self, item, spider):
        self.fp.write(item['title'] + ':' + item['content'] + '\n')
        return item   # pass the item on to the next pipeline class

    def close_spider(self, spider):
        """Called once when the spider closes"""
        self.fp.close()

# Write the data to MySQL
class MysqlPipeLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='222', db='spider', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        sql = 'insert into duanzi values ("%s","%s")' % (item['title'], item['content'])
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

# Write the data to Redis
class RedisPileLine(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)
        print(self.conn)

    def process_item(self, item, spider):
        # Depending on the redis-py version, the item may need converting (e.g. dict(item) or str(item)) before pushing
        self.conn.lpush('duanziData', item)
        return item
# settings.py
ITEM_PIPELINES = {
    # 300 is the priority; the lower the number, the higher the priority
    'DuanziPro.pipelines.DuanziproPipeline': 300,
    # 'DuanziPro.pipelines.MysqlPipeLine': 301,
    # 'DuanziPro.pipelines.RedisPileLine': 302,
}
class DuanziSpider(scrapy.Spider):
    name = 'duanzi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']
    # Generic URL template
    url = 'http://xxx.com/category/xx/%d/'
    pageNum = 1

    # Override start_requests to customize how the initial requests are sent
    # def start_requests(self):
    #     for url in self.start_urls:
    #         yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        article_list = response.xpath('/html/body/section/div/div/main/article')
        for article in article_list:
            title = article.xpath('./div[1]/h1/a/text()').extract_first()
            content = article.xpath('./div[2]//text()').extract()
            content = ''.join(content)
            item = DuanziproItem()
            item['title'] = title
            item['content'] = content
            yield item
        # Manually send requests for the remaining pages
        if self.pageNum < 5:
            self.pageNum += 1
            new_url = format(self.url % self.pageNum)
            yield scrapy.Request(url=new_url, callback=self.parse)
When the data to be crawled is not all on the same page, pass the item between callbacks through the request's meta (request passing).
class MovieSpider(scrapy.Spider):
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.xxx.com/xx/id/1.html']
    url = 'https://www.xxx.com/xx/id/1/page/%d.html'
    pageNum = 1

    def parse(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            title = li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/text()').extract_first()
            # Build the detail-page URL
            detail_url = 'https://www.xxx.com' + li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/@href').extract_first()
            item = MovieproItem()
            item['title'] = title
            # Manually request the detail page
            # Request passing: meta is a dict that is handed to the callback
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
        # Full-site crawl
        if self.pageNum < 4:
            self.pageNum += 1
            new_url = format(self.url % self.pageNum)
            yield scrapy.Request(new_url, callback=self.parse)

    # A custom parse method (it must take a response parameter)
    def parse_detail(self, response):
        # Receive the meta passed along with the request
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item
Increase concurrency:
By default Scrapy uses 16 concurrent requests; this can be raised. In the settings file, set CONCURRENT_REQUESTS = 100 to allow 100 concurrent requests.
Lower the log level:
Running Scrapy produces a large amount of log output. To reduce CPU usage, restrict the log output to INFO or ERROR. In the settings file: LOG_LEVEL = 'ERROR'
Disable cookies:
If cookies are not actually needed, disable them while crawling to reduce CPU usage and improve efficiency. In the settings file: COOKIES_ENABLED = False
Disable retries:
Re-requesting (retrying) failed HTTP requests slows the crawl down, so retries can be disabled. In the settings file: RETRY_ENABLED = False
Reduce the download timeout:
When crawling very slow links, lowering the download timeout lets stuck requests be abandoned quickly, which improves efficiency. In the settings file: DOWNLOAD_TIMEOUT = 10 sets a 10-second timeout. A consolidated settings sketch follows.
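Putting the five tweaks above together, a minimal settings.py sketch (the concrete numbers are just the example values used above, not required settings):

# settings.py: efficiency-related options discussed above
CONCURRENT_REQUESTS = 100     # raise concurrency from the default of 16
LOG_LEVEL = 'ERROR'           # lower the log level
COOKIES_ENABLED = False       # disable cookies if they are not needed
RETRY_ENABLED = False         # disable retries of failed requests
DOWNLOAD_TIMEOUT = 10         # give up on slow links after 10 seconds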
In the spider class, store the parsed image URLs in items and submit the items to the designated pipeline.
In the pipeline file, import ImagesPipeline.
Define a custom pipeline class that inherits from ImagesPipeline.
# settings.py
ITEM_PIPELINES = {
    'imgPro.pipelines.ImgproPipeline': 300,
}
# Directory where the images are stored
IMAGES_STORE = './imgs'
# Spider file
import scrapy
from imgPro.items import ImgproItem

class ImgdemoSpider(scrapy.Spider):
    name = 'imgDemo'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xx.com/daxuemeinv/']

    def parse(self, response):
        li_list = response.xpath('//*[@id="content"]/div[2]/div[2]/ul/li')
        for li in li_list:
            img_src = 'http://www.521609.com' + li.xpath('./a[1]/img/@src').extract_first()
            img_name = li.xpath('./a[2]/b/text() | ./a[2]/text()').extract_first() + '.jpg'
            item = ImgproItem()
            item['img_src'] = img_src
            item['img_name'] = img_name
            yield item
# pipelines.py
from scrapy.pipelines.images import ImagesPipeline
import scrapy

class ImgproPipeline(ImagesPipeline):
    # Returns the file name; the directory comes from IMAGES_STORE = './imgs'
    def file_path(self, request, response=None, info=None):
        # Receive the meta passed along with the request
        item = request.meta['item']
        return item['img_name']

    # Send a get request for the image URL stored in the item
    def get_media_requests(self, item, info):
        # meta is passed on to file_path
        yield scrapy.Request(item['img_src'], meta={'item': item})

    # Return the item so it is handed to the next pipeline class
    def item_completed(self, results, item, info):
        return item
Downloader middleware intercepts all requests and responses.
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'middlePro.middlewares.MiddleproDownloaderMiddleware': 543,
}
# middlewares.py
from scrapy import signals
import random

class MiddleproDownloaderMiddleware(object):
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

    # Intercept normal requests
    # request: the intercepted request
    def process_request(self, request, spider):
        print('process_request!!!')
        # UA spoofing
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        return None

    # Intercept all responses
    def process_response(self, request, response, spider):
        return response

    # Intercept requests that raised an exception; the goal is to fix them and resend the corrected request
    def process_exception(self, request, exception, spider):
        # Proxy handling
        # request.meta['proxy'] = 'http://ip:port'
        print('i am exception!!!')
        return request
# middlewares.py
from scrapy.http import HtmlResponse
from time import sleep

class WproDownloaderMiddleware(object):

    def process_request(self, request, spider):
        return None

    # Intercept all responses (1 + 5 + n); only the 5 board responses do not meet our needs
    def process_response(self, request, response, spider):
        # 1. Among all intercepted responses, find the 5 that do not meet our needs
        #    request.url: the URL each response belongs to
        #    spider.five_model_urls: the URLs of the 5 news boards
        if request.url in spider.five_model_urls:
            # Responses that satisfy this condition belong to the 5 boards
            spider.bro.get(request.url)   # open the board URL in the selenium browser
            sleep(3)
            spider.bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(2)
            spider.bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(2)
            page_text = spider.bro.page_source
            # 2. Discard the original 5 response objects and build 5 new ones
            # 3. The new responses contain the dynamically loaded news titles
            # 4. Return the 5 new responses that satisfy our needs
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            return response

    def process_exception(self, request, exception, spider):
        pass
# Spider file
import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem

class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.xxx.com/']
    # Number of response objects involved in the whole project: 1 + 5 + n
    # Parsed here: the URLs of the five news boards
    five_model_urls = []
    bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据+算法\chromedriver.exe')

    # Called exactly once, when the spider closes
    def closed(self, spider):
        self.bro.quit()

    def parse(self, response):
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        model_indexs = [3, 4, 6, 7, 8]
        for index in model_indexs:
            li_tag = li_list[index]
            # Parse out the URL of each board
            model_url = li_tag.xpath('./a/@href').extract_first()
            self.five_model_urls.append(model_url)
            # Manually send a request for each board URL
            yield scrapy.Request(model_url, callback=self.parse_model)

    # Parse the news titles and detail-page URLs inside each board (both are loaded dynamically)
    def parse_model(self, response):
        # Without the middleware, the response parameter here would not meet our needs
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        for div in div_list:
            title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
            detail_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
            item = WangyiproItem()
            item['title'] = title
            if detail_url:
                yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        content = response.xpath('//*[@id="endText"]//text()').extract()
        content = ''.join(content)
        item['content'] = content
        yield item
CrawlSpider is a subclass of Spider.
Used for full-site data crawling
Create a project
Create the spider file: scrapy genspider -t crawl spiderName www.xxx.com
LinkExtractor (link extractor): extracts links from responses according to a given rule (allow=regex)
Rule (rule parser): sends requests for the extracted links and parses the responses with the specified callback
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem_content, SunproItem

class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://xx.com/index.php/question/questionType?type=4&page=']

    # Instantiate link extractors
    # A link extractor pulls links out of responses according to the given rule (allow=(regex))
    link = LinkExtractor(allow=r'type=4&page=\d+')                 # page-number links
    link_detail = LinkExtractor(allow=r'question/\d+/\d+\.shtml')  # detail-page links
    rules = (
        # Rule parser
        # Sends requests for the links the extractor found and parses the responses with the given callback
        Rule(link, callback='parse_item', follow=False),
        Rule(link_detail, callback='parse_detail'),
    )

    # Called once per request generated from the extracted links
    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/@title').extract_first()
            status = tr.xpath('./td[3]/span/text()').extract_first()
            item = SunproItem()
            item['title'] = title
            item['status'] = status
            yield item

    def parse_detail(self, response):
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]').extract()
        content = ''.join(content)
        item = SunproItem_content()
        item['content'] = content
        yield item
Multiple machines can be grouped into a cluster that runs the same program and crawls the same set of network resources cooperatively. Native Scrapy cannot do this because its scheduler and pipelines cannot be shared; distribution is achieved with scrapy plus Redis (the scrapy-redis component).
Installation
pip install scrapy-redis
Usage workflow
Create a project
Create a spider file
Modify the spider class
Modify the settings.py configuration
Run the project: scrapy runspider spider.py
Push the start URL into the shared scheduler queue (a sketch follows below).
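A minimal sketch of that step, assuming a local Redis and the redis_key = 'sun' used in the spider below; the URL is the placeholder start URL from the earlier CrawlSpider example:

# Push the start URL into the shared scheduler queue (redis_key = 'sun')
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
# Equivalent redis-cli command: lpush sun http://xx.com/index.php/question/questionType?type=4&page=
conn.lpush('sun', 'http://xx.com/index.php/question/questionType?type=4&page=')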
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from fbsPro.items import FbsproItem

class FbsSpider(RedisCrawlSpider):
    name = 'fbs'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['http://www.xxx.com/']
    redis_key = 'sun'   # name of the shared scheduler queue

    link = LinkExtractor(allow=r'type=4&page=\d+')
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/@title').extract_first()
            status = tr.xpath('./td[3]/span/text()').extract_first()
            item = FbsproItem()
            item['title'] = title
            item['status'] = status
            yield item
# settings.py
# Use the scrapy-redis pipeline
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}
# Use the scrapy-redis scheduler
# Dedup container class: stores request fingerprints in a Redis set, making deduplication persistent
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use scrapy-redis's own scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler persists: if True, the request queue and fingerprint set in Redis are NOT cleared
# when the spider finishes (this is what enables incremental crawling); if False, they are cleared
SCHEDULER_PERSIST = True
# Redis connection
REDIS_HOST = '192.168.16.64'
REDIS_PORT = 6379
Incremental crawling detects whether a site has published new data and crawls only what is new. The core is deduplication: record every crawled URL in a record table (here a Redis set).
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviePro.items import MovieproItem

class MovieSpider(CrawlSpider):
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.xxx.tv/frim/index1.html']
    conn = Redis(host='127.0.0.1', port=6379)

    link = LinkExtractor(allow=r'frim/index1-\d+\.html')   # page-number links
    rules = (
        Rule(link, callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # Movie name + detail-page URL
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            name = li.xpath('./div/a/@title').extract_first()
            item = MovieproItem()
            item['name'] = name
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
            # sadd returns 1 if the URL was not yet in the record table
            ex = self.conn.sadd('movie_record', detail_url)
            if ex == 1:   # this movie is not in the record table yet
                print('New data found!')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
            else:
                print('No new data to crawl...')

    def parse_detail(self, response):
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item
# pipelines.py
class MovieproPipeline(object):
    def process_item(self, item, spider):
        conn = spider.conn
        # Depending on the redis-py version, the item may need converting (e.g. dict(item) or str(item)) before pushing
        conn.lpush('movieData', item)
        return item