我自己写的dcd爬虫,这个网站比较简单。看了看别人的程序,觉得用起来挺别扭,就自己捣鼓了一天。弄出来了。
这个网站没有反爬,有一些是动态网页,有一些是静态。
首先,获取销量排行榜前300的车型。
import os
import json
import requests
from parsel import Selector
# ---------------------------------------------------------#
# ---- * Fetch the top-300 / top-100 best-selling cars * ----#
# ---------------------------------------------------------#
# Ranking endpoint (JSON API, no anti-scraping on this site).
url = "https://www.dongchedi.com/motor/pc/car/rank_data"
# Browser-like User-Agent so the server treats us as a normal client.
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
}
def get_param(page):
    """Build the query-string dict for one page of the sales ranking.

    ``page`` is the ranking offset as a string (10 cars per page, so
    "0", "10", "20", ...).  All other fields are fixed site parameters.
    """
    return {
        "aid": "1839",
        "app_name": "auto_web_pc",
        "city_name": "烟台",
        "count": "10",
        "offset": page,
        "month": "",
        "new_energy_type": "",
        "rank_data_type": "11",
        "brand_id": "",
        "price": "",
        "manufacturer": "",
        "outter_detail_type": "",
        "nation": "0",
    }
def get_response(pageNum):
    """GET one page of the sales ranking (page ``pageNum``, 10 cars each).

    Returns the ``requests.Response``; raises ``requests.HTTPError`` on a
    non-2xx status.
    """
    params = get_param(str(pageNum * 10))
    # FIX: the original call had no timeout, so one stalled connection could
    # hang the whole crawl forever.  verify=False kept from the original
    # (the site's TLS setup apparently needed it) — NOTE(review): consider
    # re-enabling verification.
    with requests.get(url=url, headers=headers, params=params,
                      verify=False, timeout=30) as resp:
        resp.raise_for_status()
        print(resp.status_code)
        return resp
# Walk all 30 ranking pages (offsets 0, 10, ..., 290) and keep each page's
# raw JSON payload for later jsonpath extraction.
data_list = []
for page in range(30):
    print(f"销量前{page * 10} 的车")
    response = get_response(page)
    data_list.append(response.json())
获取之后,就能访问该车型,一般一个车型有好多款式,我的目的是想比较一些车型的尺寸,所以一个车型就选第一种款式,访问进入该车型第一种款式的参数配置,这样把参数下载下来,放到一个文件里,就可以比较现在卖的车的尺寸情况。
第二部分,我尝试了一下动态请求车型的价格。不过这一部分后面数据分析没有用到。
# Extract the per-series fields from the 30 raw JSON responses.
# FIX: the bare REPL expressions (len(data_list), id_list, ...) were no-ops
# in a script and have been removed; the append loop is now a comprehension.
import jsonpath

name_list = jsonpath.jsonpath(data_list, "$..series_name")   # series display names
id_list = jsonpath.jsonpath(data_list, "$..series_id")       # series ids
first_list = jsonpath.jsonpath(data_list, "$..online_car_ids")  # on-sale trim ids per series
# Keep only the first on-sale trim of every series; None when a series has
# no car currently on sale (filtered out later with dropna()).
car_id_list = [ids[0] if ids else None for ids in first_list]
import pandas as pd

# One row per series: name, series id, and the id of its first on-sale trim.
df = pd.DataFrame({
    "name": name_list,
    "series": id_list,
    "first_id": car_id_list
})
# FIX: the original `df[df['first_id'] == None]` compares elementwise against
# None and matches nothing; isna() is the correct missing-value test.
print(df[df['first_id'].isna()])
df2 = df.dropna()
print(df.shape, df2.shape)
# NOTE: the index is written on purpose — after the read-back below it becomes
# the first (rank) column, which the rename relies on.
df2.to_csv("Pythn-Anlys-138/dcd/top300cars.csv")
df = pd.read_csv("Pythn-Anlys-138/dcd/top300cars.csv")
df.columns = ['rank', 'name', 'series', 'first_id']
df.to_csv("Pythn-Anlys-138/dcd/top300cars.csv")
# ---------------------------------------------------------#
# ---- * Dealer price for one car * ----#
# ---------------------------------------------------------#
# REPL leftover: displays the last first_id bound by the loop above
# (may be None if the last series had no on-sale cars) — a no-op in a script.
first_id
def get_price(car_id):
    """Fetch the dealer-price JSON for one car id and save it to disk.

    Writes ``Pythn-Anlys-138/dcd/<car_id>.json`` (UTF-8, non-ASCII kept).
    Raises ``requests.HTTPError`` on a non-2xx response.
    """
    wk_dir = "Pythn-Anlys-138/dcd"
    fname = car_id + ".json"
    price_url = "https://www.dongchedi.com/motor/pc/car/series/car_dealer_price"
    params = {
        "aid": "1839",
        "app_name": "auto_web_pc",
        "car_ids": car_id,
        "city_name": "烟台"
    }
    # FIX: the original headers dict contained the placeholder "。。。",
    # which is a Python syntax error — reuse the module-level browser
    # headers instead.  A timeout is added so a stall cannot hang the run.
    with requests.get(url=price_url, headers=headers, params=params,
                      verify=False, timeout=30) as resp:
        resp.raise_for_status()
        rj = resp.json()
    # Create the output folder on first use instead of crashing.
    os.makedirs(wk_dir, exist_ok=True)
    with open(os.path.join(wk_dir, fname), 'w', encoding="utf-8") as f:
        f.write(json.dumps(rj, ensure_ascii=False))
    print(f"保存文件成功 {car_id} !!!")
# Demo call: fetch the dealer-price JSON for the last first_id seen above.
# str() is needed because the id from the ranking JSON is an int.
first_id = str(first_id)
get_price(first_id)
这一部分后期没什么用,代码也很乱。
第三部分,获取某一车型的第一种款式的参数。
# ---------------------------------------------------------#
# ---- * 参数配置 * ----#
# ---------------------------------------------------------#
from parsel import Selector
def get_detail_page(id):
    """Download the spec/parameter page HTML for one trim id (string).

    Returns the page HTML; raises ``requests.HTTPError`` on a non-2xx
    response.  (Parameter keeps its original name for caller compatibility,
    though it shadows the builtin ``id``.)
    """
    page_url = "https://www.dongchedi.com/auto/params-carIds-" + id
    # FIX: the original headers dict contained the placeholder "。。。",
    # which is a Python syntax error — reuse the module-level browser
    # headers instead.  Timeout added so a stall cannot hang the crawl.
    with requests.get(url=page_url, headers=headers,
                      verify=False, timeout=30) as resp:
        resp.raise_for_status()
        return resp.text
# --- Exploratory pass over one detail page --------------------------------
# FIX: the original passed the *builtin* function `id` here, so the string
# concatenation inside get_detail_page raised TypeError.  Use the first car
# id fetched above instead.  Bare REPL display lines were removed.
html = get_detail_page(str(first_id))
selector = Selector(html)
all_rows = selector.css('div[data-row-anchor]')  # one <div> per spec row
print(len(all_rows))
# Each spec row becomes a one-entry {label: value} dict.
dct_list = []
for row in all_rows:
    label = row.css('div:nth-child(1) label::text').get()
    value = row.css('div:nth-child(2) div::text').get()
    dct_list.append({label: value})
first_row = all_rows[0]
def parse_detail(id):
    """Fetch and parse the spec page of one trim.

    Returns ``{"id": id, "detail": [...]}`` where ``detail`` is a list of
    one-entry ``{label: value}`` dicts, one per spec row on the page.
    """
    selector = Selector(get_detail_page(id))
    detail = [
        {
            row.css('div:nth-child(1) label::text').get():
            row.css('div:nth-child(2) div::text').get()
        }
        for row in selector.css('div[data-row-anchor]')
    ]
    return {"id": id, "detail": detail}
# Sanity-check the parser on the first car before looping over all of them.
# FIX: the original passed the builtin `id` function (TypeError) and then
# displayed the undefined name `first_id_list` (NameError).
dct_detail = parse_detail(str(first_id))
print(dct_detail)
def save_detail(id, dct_detail):
    """Write one car's parsed spec dict to ``<dir>/<id>_dcd_detail.json``.

    ``id`` may be any value with a sensible ``str()``; the JSON is written
    UTF-8 with non-ASCII characters kept readable.
    """
    out_dir = "Pythn-Anlys-138/dcd"
    # FIX: create the output folder on first use instead of crashing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(out_dir, exist_ok=True)
    fname = str(id) + "_dcd_detail.json"
    with open(os.path.join(out_dir, fname), 'w', encoding='utf8') as f:
        json.dump(dct_detail, f, ensure_ascii=False)
    print(f"Detail file {id} saved!!!")
# Download and save the spec sheet of every series' first trim.
# FIX: the original iterated `first_id_list`, which was never defined
# (NameError).  The ids live in `car_id_list`; entries are None when a series
# has nothing on sale, so skip those, and str() the id for URL concatenation.
for fid in car_id_list:
    if fid is None:
        continue
    fid = str(fid)
    dct_detail = parse_detail(fid)
    save_detail(fid, dct_detail)
最后,下载了一些json文件。后期做了一些数据整理。做成了数据表是这样的。
结果还不错。