"""
爬取百度贴吧,段友之家的图片和视频
author: cuizy
time:2018-05-19
"""
import requests
import bs4
import os
def write_file(file_url, file_type, timeout=30):
    """Download one resource and save it under a type-specific folder.

    The target folder is ``nhdz/jpg`` when ``file_type`` is 1, ``nhdz/mp4``
    when it is 2 and ``nhdz/other`` for any other value; it is created when
    missing. The local file name is the last segment of the URL *path*
    (query string and fragment are ignored).

    Args:
        file_url: URL of the resource to download.
        file_type: 1 for an image, 2 for a video, anything else for "other".
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Function-scope import so the block does not rely on file-level changes.
    from urllib.parse import urlsplit

    if file_type == 1:
        sub_folder = 'jpg'
    elif file_type == 2:
        sub_folder = 'mp4'
    else:
        sub_folder = 'other'
    # os.path.join instead of a hard-coded '\\' so that a real sub-folder is
    # created on non-Windows systems as well.
    file_folder = os.path.join('nhdz', sub_folder)

    # stream=True avoids holding a whole video in memory; the ``with`` block
    # releases the connection even when writing fails.
    with requests.get(file_url, stream=True, timeout=timeout) as res:
        res.raise_for_status()
        os.makedirs(file_folder, exist_ok=True)

        # Take the name from the URL path only: a '/' inside the query string
        # must not influence it. Fall back to a fixed name when the path ends
        # with '/' and therefore has no final segment.
        file_name = urlsplit(file_url).path.rsplit('/', 1)[-1] or 'index'
        file_path = os.path.join(file_folder, file_name)

        print('正在写入资源文件:', file_path)
        with open(file_path, 'wb') as out_file:
            for chunk in res.iter_content(100000):
                out_file.write(chunk)
    print('写入完成!')
def download_file(web_url, timeout=30):
    """Crawl a listing page and all following pages, downloading their media.

    For every page, each element matching ``.vpic_wrap img`` is downloaded
    from its ``bpic`` attribute as an image (type 1) and each element
    matching ``.threadlist_video a`` from its ``data-video`` attribute as a
    video (type 2), both through ``write_file``. The crawl then follows the
    ``#frs_list_pager .next`` link until there is none.

    Args:
        web_url: URL of the first listing page.
        timeout: Seconds to wait for each listing page before giving up.

    Raises:
        requests.HTTPError: If a listing page answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Function-scope import so the block does not rely on file-level changes.
    from urllib.parse import urljoin

    # Pages are walked in a loop rather than by recursion so that a long
    # forum cannot exhaust the interpreter's recursion limit. Visited URLs
    # are remembered so a pager that links back to itself cannot loop forever.
    visited = set()
    while web_url and web_url not in visited:
        visited.add(web_url)
        print('正在下载网页: %s...' % web_url)
        result = requests.get(web_url, timeout=timeout)
        # Fail loudly instead of parsing an error page as "no resources".
        result.raise_for_status()
        soup = bs4.BeautifulSoup(result.text, "html.parser")

        img_list = soup.select('.vpic_wrap img')
        if not img_list:
            print('未发现图片资源!')
        for img_info in img_list:
            file_url = img_info.get('bpic')
            # Skip elements that lack the attribute instead of passing None on.
            if file_url:
                write_file(file_url, 1)

        video_list = soup.select('.threadlist_video a')
        if not video_list:
            print('未发现视频资源!')
        for video_info in video_list:
            file_url = video_info.get('data-video')
            if file_url:
                write_file(file_url, 2)

        print('下载资源结束:', web_url)

        next_link = soup.select('#frs_list_pager .next')
        next_href = next_link[0].get('href') if next_link else None
        if not next_href:
            print('下载资料结束!')
            return
        if next_href.startswith('//'):
            # Protocol-relative link: keep the original behaviour (force https).
            web_url = 'https:' + next_href
        else:
            # Absolute or path-relative link: resolve against the current page.
            web_url = urljoin(web_url, next_href)
if __name__ == '__main__':
    # Script entry point: start crawling at the first listing page of the forum.
    download_file('https://tieba.baidu.com/f?ie=utf-8&kw=段友之家')