I. Overview of Distributed Crawlers
(1) Advantages of Distributed Crawlers
1. Makes full use of the combined bandwidth of multiple machines
2. Makes full use of the IP addresses of multiple machines
(2) The Redis Database
1. Redis is a high-performance NoSQL database
2. All Redis operations are atomic, which is what makes it safe as a task queue shared by many crawler processes (see the sketch after this list)
3. Redis data types are built directly on basic data structures, with no extra abstraction layer
4. Redis has five data types: string, hash, list, set, and zset (sorted set)
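A minimal sketch of why the atomicity matters for distributed crawling (the queue name url_queue and the example URLs are made up for illustration): several processes can run the consumer loop below against the same Redis server, and brpop guarantees that no URL is handed to two workers.
import redis

db = redis.Redis(host="localhost", port=6379, decode_responses=True)

# producer: any node pushes new URLs onto the shared queue
db.lpush("url_queue", "http://example.com/page1", "http://example.com/page2")

# consumer: each crawler process pops atomically, so no URL is processed twice
while True:
    task = db.brpop("url_queue", timeout=5)  # blocks up to 5 s, returns (key, value) or None
    if task is None:
        break
    print("crawling", task[1])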
(3) Operating Redis from Python
1. In a terminal: pip install redis
2. Example code:
import redis

# decode_responses=True returns str instead of bytes
db = redis.Redis(host="localhost", port=6379, decode_responses=True)

# string: setting an existing key simply overwrites its value
db.set("name", "Sam")
db.set("name2", "张三")
print(db.get("name2"))

# set / get several values at once
db.mset({"k1": "v1", "k2": "v2"})
print(db.mget("k1", "k2", "name2"))

# hash
db.hset("hash1", "hkey1", "hvalue1")
db.hset("hash1", "hkey2", "hvalue2")
db.hset("hash1", "hkey3", "hvalue3")
print(db.hget("hash1", "hkey2"))
print(db.hgetall("hash1"))

# list
db.lpush("list1", 11, 22, 33)
print(db.llen("list1"))
print(db.lrange("list1", 0, -1))

# set
db.sadd("set1", 55, 44, 77)
print(db.scard("set1"))
print(db.smembers("set1"))

# zset (sorted set): members stored with scores
db.zadd("zset1", {"item1": 1, "item2": 2, "item3": 2})
print(db.zcard("zset1"))
print(db.zrange("zset1", 0, -1))
print(db.zrange("zset1", 0, -1, withscores=True))
(4) Saving Redis Data to MongoDB
When the scrapy_redis RedisPipeline from Part II is enabled, scraped items are stored as JSON strings in the Redis list <spider name>:items (here app:items); the script below copies their titles into MongoDB.
import redis
import pymongo
import json

db_redis = redis.Redis(host="localhost", port=6379, decode_responses=True)
client_mongo = pymongo.MongoClient("mongodb://localhost:27017")
db_mongo = client_mongo["RedisToMongo"]
col_mongo = db_mongo["C1"]

# each element of app:items is a JSON-encoded item
for i in db_redis.lrange("app:items", 0, -1):
    page = {
        "title": json.loads(i)["title"]
    }
    res = col_mongo.insert_one(page)
    print(res.inserted_id)
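A quick sanity check that the copy worked, reusing the connection objects from the script above:
print(db_redis.llen("app:items"))        # number of items still stored in Redis
print(col_mongo.count_documents({}))     # number of documents now in MongoDB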
II. Distributed Crawler in Practice
Implementing a simple distributed crawler:
1. Create the crawler project and spider file with the same steps as an ordinary Scrapy project.
2. In settings.py, set USER_AGENT, ROBOTSTXT_OBEY and LOG_LEVEL, and uncomment ITEM_PIPELINES (a sketch of these changes is shown below).
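A minimal sketch of step 2, assuming a project named C07L07 as in the code later on; the user-agent string and log level are placeholder choices:
# settings.py - step 2 (values are illustrative)
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"   # any realistic browser UA
ROBOTSTXT_OBEY = False                                     # the local test server has no robots.txt
LOG_LEVEL = "WARNING"                                      # keep console output readable
ITEM_PIPELINES = {
    "C07L07.pipelines.C07L07Pipeline": 300,                # uncomment the default project pipeline
}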
3. Install scrapy-redis in a terminal: pip install scrapy-redis
4. Modify the code in app.py as follows:
import scrapy
from ..items import C07L07Item
from scrapy_redis.spiders import RedisSpider

class AppSpider(RedisSpider):
    name = "app"
    # start URLs are read from this Redis key instead of start_urls
    redis_key = "app"
    # start_urls = ["http://127.0.0.1:5000/C07L07"]

    def __init__(self, *args, **kwargs):
        domain = kwargs.pop("domain", "")
        self.allowed_domains = list(filter(None, domain.split(",")))
        super(AppSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        links = response.xpath('//a/@href').getall()
        for link in links:
            link = "http://127.0.0.1:5000" + link
            yield scrapy.Request(url=link, callback=self.parse_details, dont_filter=True)

    def parse_details(self, response):
        item = C07L07Item()
        item["title"] = response.text
        yield item
A second version of app.py, which crawls a JD book list page and follows the pagination:
import scrapy
import re
from scrapy_redis.spiders import RedisSpider
from ..items import C07L09Item

class AppSpider(RedisSpider):
    name = "app"
    # this spider is started by pushing a start URL to the "jingdong" key
    redis_key = "jingdong"
    # allowed_domains = ["list.jd.com"]
    # start_urls = ["https://list.jd.com/list.html?cat=1713%2C3258&ev=2953_75727%5E&page=1&s=1&click=0"]

    def __init__(self, *args, **kwargs):
        domain = kwargs.pop("domain", "")
        self.allowed_domains = list(filter(None, domain.split(",")))
        super(AppSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        books = response.xpath('//*[@id="J_goodslist"]/ul/li')
        if len(books) != 0:
            for book in books:
                # create a fresh item for every book
                item = C07L09Item()
                item["title"] = book.xpath('.//div[@class="p-name"]//em/text()').get()
                item["price"] = book.xpath('.//div[@class="p-price"]//i/text()').get()
                yield item
            # read the current page and s parameters from the URL, e.g. page=1&s=1 -> ("1", "1")
            exp = re.compile(r'page=(\d*?)&s=(\d*?)&')
            result = exp.findall(response.url)[0]
            page = str(int(result[0]) + 1)
            s = str(int(result[1]) + 30)
            next_url = "https://list.jd.com/list.html?cat=1713%2C3258&ev=2953_75727%5E&page={}&s={}&click=0".format(page, s)
            print(next_url)
            yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=True)
Modify the data structure in items.py (the JD example additionally needs a C07L09Item with title and price fields):
import scrapy

class C07L07Item(scrapy.Item):
    title = scrapy.Field()
Modify the code in pipelines.py:
from itemadapter import ItemAdapter

class C07L07Pipeline:
    def process_item(self, item, spider):
        print(item["title"])
        return item
5. Add the following code to settings.py and modify ITEM_PIPELINES. DUPEFILTER_CLASS and SCHEDULER move request deduplication and scheduling into Redis so that every node shares one request queue, and SCHEDULER_PERSIST keeps that queue in Redis after the spider closes.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"
DOWNLOAD_DELAY = 1
ITEM_PIPELINES = {
"C07LO7.pipelines.C07LO7Pipeline":300,
"scrapy_redis.pipelines.RedisPipeline":400
}
6. Connect to Redis in a terminal with redis-cli and push a start URL to the spider's redis_key:
lpush app http://127.0.0.1:5000/C07L07
7. Run the crawler: scrapy crawl app (the same command can be started in several processes or on several machines; they all share the Redis request queue).
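Alternatively, the start URL can be pushed from Python instead of redis-cli; a small sketch assuming the same local Redis instance (for the JD example the key would be jingdong and the value a list.jd.com URL):
import redis

db = redis.Redis(host="localhost", port=6379, decode_responses=True)

# the key must match the spider's redis_key; every idle process running
# "scrapy crawl app" picks up work from this shared queue
db.lpush("app", "http://127.0.0.1:5000/C07L07")
print(db.llen("app"))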