Introduction
I've been learning Ruby recently, so I wrote a small crawler as practice. It touches on fundamentals such as XML/HTML parsing, multithreading, and XPath syntax.
Implementation
Usage
Install the following gems before running:
gem install nokogiri http
# nokogiri: parses XML and HTML, with support for CSS and XPath selectors
# http: a library for making HTTP requests
# openssl is bundled with Ruby's standard library, so it needs no separate install
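To confirm the gems load, a quick sanity check like the following should print "hi" and an HTTP status line (the example.com request assumes outbound network access is available):
require 'nokogiri'
require 'http'
doc = Nokogiri::HTML('<p id="t">hi</p>')
puts doc.xpath('//p[@id="t"]/text()')         # XPath selector; doc.css('#t') works too
puts HTTP.get('https://example.com').status   # e.g. 200 OK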
Source Code
require 'nokogiri'
require 'openssl'
require 'time'
require 'http'
require 'fileutils' # for FileUtils.mkdir_p below
# Note: Thread and Queue are Ruby core classes, so the old require 'thread' is unnecessary
# The target site hosts adult content, so its address is obfuscated here; decoding it is left to the reader
BASE_URL = 'l5VKR[9`aI10.P;m*LzIh,]@P17&0^F'
# AES-128-CBC decryption: the site's images are encrypted, so each download must be decrypted
def aes_128_cbc_decrypt(encrypted_data, key = 'f5d965df75336270', iv = '97b60394abc2fbe1')
  aes = OpenSSL::Cipher.new('aes-128-cbc')
  aes.decrypt
  aes.key = key
  aes.iv = iv
  aes.padding = 0 # disable padding
  aes.update(encrypted_data) + aes.final
end
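# Sanity check for the helper (hypothetical sample data, not from the site): sixteen
# bytes encrypted with the same key/iv should round-trip exactly:
#   c = OpenSSL::Cipher.new('aes-128-cbc')
#   c.encrypt
#   c.key = 'f5d965df75336270'
#   c.iv = '97b60394abc2fbe1'
#   c.padding = 0
#   aes_128_cbc_decrypt(c.update('A' * 16) + c.final) # => "AAAAAAAAAAAAAAAA"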
# Fetch a page and parse it into a Nokogiri document; returns nil on failure
def get_page_doc(page_url)
  begin
    # HTTP.follow makes the client follow redirects automatically
    resp = HTTP.follow.get(page_url)
    # Parse the response body into a document
    doc = Nokogiri::HTML(resp.body.to_s)
  rescue StandardError => e # StandardError rather than Exception, so signals still propagate
    puts e.message
  end
  doc
end
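# Usage sketch (the URL is only illustrative):
#   doc = get_page_doc("#{BASE_URL}category/wpcz/1/")
#   doc.xpath('//title/text()') unless doc.nil?
# Callers must nil-check, since a failed request returns nil instead of raising.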
# Extract article URLs, titles, and publish times from a listing page
def fetch_list_urls(doc)
  page_list = []
  urls = []
  infos = []
  # Collect the link to each article on the page
  doc.xpath('//*[@id="archive"]/article/a/@href').each do |link|
    # link is an attribute node, so take its text with .content before concatenating
    urls << BASE_URL + link.content
  end
  # Collect each article's title and publish time
  doc.xpath('//*[@class="post-card"]/div[2]/div').each do |title|
    info = {}
    if title.content.gsub(/\s+/, '') != ''
      # Title text
      t = title.xpath('h2[@class="post-card-title"]/text()')[0].content
      # Publish time
      time_str = title.xpath('div[@class="post-card-info"]/span[2]/@content')[0].content
      publish_time = Time.parse(time_str).strftime('%Y/%m/%d')
      info['title'], info['publish_time'] = t, publish_time
      infos << info
    else
      # Cards with no text are ads; keep an empty placeholder so the indexes stay aligned
      info['title'], info['publish_time'] = '', ''
      infos << info
    end
  end
  # Zip urls and infos into one hash per article
  urls.each_with_index do |url, i|
    page = { 'url' => url, 'title' => infos[i]['title'], 'publish_time' => infos[i]['publish_time'] }
    page_list << page
  end
  page_list
end
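# Each element of the returned array is a hash shaped like this (values are hypothetical):
#   { 'url' => BASE_URL + 'archives/123/', 'title' => 'Some post', 'publish_time' => '2024/01/01' }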
# Download all images from a single article page
def fetch_page(title, page_url)
  doc = get_page_doc(page_url)
  # Strip characters that are invalid in Windows directory names, or creating the directory fails
  title = title.gsub(/[“”:、\-*<>?\|\/?!!\s]*/, '')
  filename = File.join(File.dirname($0), "images/#{title}")
  unless doc.nil?
    # mkdir_p also creates the parent images/ directory, which a bare Dir.mkdir would not
    FileUtils.mkdir_p(filename)
    # The encrypted image URLs live in a data attribute rather than src
    urls = doc.xpath('//*[@itemprop="articleBody"]/p/img/@data-xkrkllgl')
    # Queue [index, url] pairs so every image gets a stable, unique filename
    work_queue = Queue.new
    urls.each_with_index { |img_url, idx| work_queue << [idx, img_url] }
    # A small worker pool; spawning one thread per image would make the queue pointless
    workers = (1..[urls.size, 4].min).map do
      Thread.new do
        begin
          # pop(true) is the non-blocking form: it raises ThreadError once the queue is empty
          while (item = work_queue.pop(true))
            idx, img_url = item
            begin
              p "Downloading image: #{img_url.content}"
              # Fetch the image data with a 3-second timeout
              raw_data = HTTP.timeout(3).get(img_url.content).body.to_s
              sleep 0.1
              # Decrypt and save
              raw_data = aes_128_cbc_decrypt(raw_data)
              File.binwrite("#{filename}/image#{idx}.jpg", raw_data)
            rescue StandardError => e
              p e.message
              next
            end
          end
        rescue ThreadError
          # Queue drained; this worker exits
        end
      end
    end
    workers.map(&:join)
  end
end
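# Usage sketch (title and path are hypothetical):
#   fetch_page('SomeTitle', "#{BASE_URL}archives/123/")
# creates images/SomeTitle/ next to the script and fills it with the decrypted JPEGs.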
# Crawl listing pages one after another until there is no next page
def start_crawl
  page_index = 1
  loop do
    begin
      url = "#{BASE_URL}category/wpcz/#{page_index}/" # the "daily gossip" listing
      p "Crawling page #{page_index}, url: #{url}"
      doc = get_page_doc(url)
      fetch_list_urls(doc).each do |page|
        fetch_page(page['title'], page['url'])
      end
      # Locate the next-page button
      next_page_xpath = '//*[@class="page-navigator"]/ol/li[@class="btn btn-primary next"]/a/text()'
      next_node = doc.xpath(next_page_xpath)[0]
      # Stop when the button is missing or its label is no longer "下一页" ("next page")
      break if next_node.nil? || next_node.content != '下一页'
      # Move on to the next page
      page_index += 1
      sleep 0.1
    rescue StandardError => e
      p e.message
      page_index += 1
      next
    end
  end
end
# Run the crawl when this file is executed directly
if __FILE__ == $0
  start_crawl
end
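To run the script, save it under any name (crawler.rb is just an example) and execute:
ruby crawler.rb
Downloaded images are written to an images/ directory created next to the script, one subdirectory per article title.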