1.读取数据,查看数据基本概况
import numpy as np
import pandas as pd
# Load the car price dataset from the working directory.
csv_path = r'./car_price_prediction.csv'
data = pd.read_csv(csv_path)
# Preview the first five rows (five is the default for head()).
print(data.head())
output:
ID Price Levy ... Wheel Color Airbags
0 45654403 13328 1399 ... Left wheel Silver 12
1 44731507 16621 1018 ... Left wheel Black 8
2 45774419 8467 - ... Right-hand drive Black 2
3 45769185 3607 862 ... Left wheel White 0
4 45809263 11726 446 ... Left wheel Silver 4
[5 rows x 18 columns]
# info() writes the column summary to stdout itself and returns None, so it
# is called directly: print(data.info) would only show the bound method's
# repr, and print(data.info()) would append a stray "None".
data.info()
output:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 19237 non-null int64
1 Price 19237 non-null int64
2 Levy 19237 non-null object
3 Manufacturer 19237 non-null object
4 Model 19237 non-null object
5 Prod. year 19237 non-null int64
6 Category 19237 non-null object
7 Leather interior 19237 non-null object
8 Fuel type 19237 non-null object
9 Engine volume 19237 non-null object
10 Mileage 19237 non-null object
11 Cylinders 19237 non-null float64
12 Gear box type 19237 non-null object
13 Drive wheels 19237 non-null object
14 Doors 19237 non-null object
15 Wheel 19237 non-null object
16 Color 19237 non-null object
17 Airbags 19237 non-null int64
dtypes: float64(1), int64(4), object(13)
# The summary above reports 19237 rows and 18 columns with no null values.
# Count the rows that are exact duplicates of an earlier row.
duplicate_mask = data.duplicated()
print(duplicate_mask.sum())
output:
313
# Drop the duplicated rows (drop_duplicates works on rows, not columns).
data.drop_duplicates(inplace=True)
# Show how the values of every column are distributed.
for col in data.columns:
    print(col)
    print(data[col].value_counts())
    # Separator between columns, as in the output listed below.
    print('-' * 30)
output:
ID
45815365 8
45815361 8
45815363 7
45815368 7
45723475 7
..
45774312 1
45732621 1
45773011 1
45774019 1
45813273 1
Name: ID, Length: 18924, dtype: int64
------------------------------
Price
15681 280
470 274
14113 244
392 242
314 235
...
42601 1
149 1
54349 1
54954 1
22075 1
Name: Price, Length: 2315, dtype: int64
------------------------------
Levy
- 5819
765 486
891 461
639 410
640 405
...
3156 1
2908 1
1279 1
1719 1
1901 1
Name: Levy, Length: 559, dtype: int64
------------------------------
Manufacturer
HYUNDAI 3769
TOYOTA 3662
MERCEDES-BENZ 2076
FORD 1111
CHEVROLET 1069
...
TESLA 1
PONTIAC 1
SATURN 1
ASTON MARTIN 1
GREATWALL 1
Name: Manufacturer, Length: 65, dtype: int64
------------------------------
Model
Prius 1083
Sonata 1079
Camry 938
Elantra 922
E 350 542
...
Feroza 1
C-MAX C-MAX 1
X1 4X4 1
Land Cruiser Prado RX 1
Prius C aqua 1
Name: Model, Length: 1590, dtype: int64
------------------------------
Prod. year
2012 2155
2014 2124
2013 1963
2011 1612
2015 1549
2010 1483
2016 1476
2017 959
2008 737
2009 601
2018 500
2007 464
2005 402
2003 367
2004 364
2006 317
2019 306
2002 296
2000 279
2001 254
1998 213
1999 207
1997 151
1996 114
1995 105
2020 47
1994 42
1992 30
1993 23
1990 18
1988 12
1991 10
1986 6
1989 6
1987 5
1984 5
1985 5
1953 4
1983 3
1939 3
1978 2
1980 2
1965 2
1977 2
1974 2
1964 2
1943 1
1976 1
1957 1
1968 1
1947 1
1982 1
1981 1
1973 1
Name: Prod. year, dtype: int64
------------------------------
Category
Sedan 8736
Jeep 5473
Hatchback 2847
Minivan 647
Coupe 532
Universal 364
Microbus 306
Goods wagon 233
Pickup 52
Cabriolet 36
Limousine 11
Name: Category, dtype: int64
------------------------------
Leather interior
Yes 13954
No 5283
Name: Leather interior, dtype: int64
------------------------------
Fuel type
Petrol 10150
Diesel 4036
Hybrid 3578
LPG 892
CNG 494
Plug-in Hybrid 86
Hydrogen 1
Name: Fuel type, dtype: int64
------------------------------
Engine volume
2 3916
2.5 2277
1.8 1760
1.6 1462
1.5 1321
...
6.8 1
6.7 1
3.1 1
0.8 Turbo 1
1.1 Turbo 1
Name: Engine volume, Length: 107, dtype: int64
------------------------------
Mileage
0 km 721
200000 km 183
150000 km 161
160000 km 120
100000 km 119
...
63083 km 1
28750 km 1
25077 km 1
77452 km 1
186923 km 1
Name: Mileage, Length: 7687, dtype: int64
------------------------------
Cylinders
4.0 14367
6.0 3462
8.0 991
5.0 169
3.0 107
2.0 42
1.0 38
12.0 38
10.0 12
16.0 5
7.0 4
9.0 1
14.0 1
Name: Cylinders, dtype: int64
------------------------------
Gear box type
Automatic 13514
Tiptronic 3102
Manual 1875
Variator 746
Name: Gear box type, dtype: int64
------------------------------
Drive wheels
Front 12874
4x4 4058
Rear 2305
Name: Drive wheels, dtype: int64
------------------------------
Doors
04-May 18332
02-Mar 777
>5 128
Name: Doors, dtype: int64
------------------------------
Wheel
Left wheel 17753
Right-hand drive 1484
Name: Wheel, dtype: int64
------------------------------
Color
Black 5033
White 4489
Silver 3792
Grey 2375
Blue 1396
Red 639
Green 322
Orange 253
Brown 187
Carnelian red 179
Golden 145
Beige 134
Sky blue 122
Yellow 106
Purple 39
Pink 26
Name: Color, dtype: int64
------------------------------
Airbags
4 5823
12 5654
0 2405
8 1608
6 1311
2 1066
10 849
5 104
16 93
7 86
1 76
9 63
3 37
11 33
14 20
15 7
13 2
Name: Airbags, dtype: int64
------------------------------
可以看到:Levy 中有 5819 条记录的取值为"-";Engine volume 的部分取值带有"Turbo"后缀(表示涡轮增压,并不是单位);Doors 的取值是类似日期的字符串(如 04-May、02-Mar,很可能是 4-5、2-3 被表格软件自动转换所致),并且含有大于号(>5);Mileage 带有单位 km;Prod. year 表示汽车的生产年份,需要据此计算车龄(已使用的年数)。我们将在接下来的数据清洗步骤中解决这些问题。
2.数据清洗
# ---- Data cleaning ----

# Levy is a tax; "-" presumably means that no levy applies, so it is
# replaced by "0" before the column is converted to integers.
levy_text = data['Levy'].str.replace('-', '0', regex=False)
data['Levy'] = levy_text.astype(int)

# Strip the "km" unit so that mileage becomes numeric.
mileage_text = data['Mileage'].str.replace('km', '')
data['Mileage'] = mileage_text.astype(float)


def _door_count(label):
    """Return the leading door number of a raw Doors label as a string.

    Labels containing "-" (e.g. "04-May") keep the part before the dash;
    other labels (e.g. ">5") only lose the ">" sign.
    """
    if "-" in label:
        return label.split("-")[0]
    return label.replace(">", "")


# Extract the door count from the raw labels.
data["Doors"] = data["Doors"].map(_door_count).astype(int)

# New 0/1 feature: does the engine volume entry carry the "Turbo" tag?
data['is_turbo'] = data['Engine volume'].map(lambda volume: int('Turbo' in volume))

# Remove the "Turbo" tag so that the engine volume itself is numeric.
volume_text = data['Engine volume'].str.replace('Turbo', '')
data['Engine volume'] = volume_text.astype(float)

# Number of distinct values per feature.
distinct_counts = data.nunique()
print(distinct_counts)
output:
ID 18924
Price 2315
Levy 559
Manufacturer 65
Model 1590
Prod. year 54
Category 11
Leather interior 2
Fuel type 7
Engine volume 107
Mileage 7687
Cylinders 13
Gear box type 4
Drive wheels 3
Doors 3
Wheel 2
Color 16
Airbags 17
is_turbo 2
# Derive the age of each car from its production year.
# The standard-library module is `datetime` and the class used here is
# `datetime.datetime`; there is no `date_time` module.
from datetime import datetime

# NOTE: the age is relative to the year in which the script runs, so the
# values shift by one every calendar year.
years_now = datetime.now().year
data['age'] = years_now - data['Prod. year']
print(data[['Prod. year', 'age']].head(5))
output:
Prod. year age
0 2010 14
1 2011 13
2 2006 18
3 2011 13
4 2014 10
# Many features are categorical text; encode each of them as integer labels.
category_col = ['Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type',
                'Gear box type', 'Drive wheels', 'Wheel', 'Color']
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in category_col:
    # The single encoder is refitted for every column, so after the loop
    # `le` only remembers the classes of the last column ('Color').
    data[col] = le.fit_transform(data[col])
# Drop the columns that are not used as features: the record ID and the
# raw production year.
data.drop(columns=['ID', 'Prod. year'], inplace=True)
# Report outliers per numeric column with the 1.5 * IQR rule and remove them.
# The columns are processed one after another on the already filtered frame,
# so each percentage is relative to the rows that survived the earlier columns.
num_col = ['Price', 'Levy', 'age', 'Engine volume', 'Mileage', 'Airbags']
for col in num_col:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    outlier = ((data[col] > high) | (data[col] < low)).sum()
    # Number of rows currently in the frame (the denominator of the share).
    total_rows = data[col].shape[0]
    print(f"Total Outliars in {col} are : {outlier} : {round(100*(outlier)/total_rows,2)}%")
    if outlier > 0:
        # Keep only the rows inside the [low, high] fence for this column.
        data = data.loc[(data[col] <= high) & (data[col] >= low)]
output:
Total Outliars in Price are : 1055 : 5.57%
Total Outliars in Levy are : 161 : 0.9%
Total Outliars in age are : 1468 : 8.29%
Total Outliars in Engine volume are : 951 : 5.86%
Total Outliars in Mileage are : 623 : 4.07%
Total Outliars in Airbags are : 0 : 0.0%
3.数据标准化
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
data.dropna(inplace=True)
scaled_df = scaler.fit_transform(data.drop(["Price"],axis=1))
4.建模
X=scaled_df
y=data["Price"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_val shape: ", X_val.shape)
print("y_val shape: ", y_val.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
output:
X_train shape: (10558, 17)
y_train shape: (10558,)
X_val shape: (1174, 17)
y_val shape: (1174,)
X_test shape: (2934, 17)
y_test shape: (2934,)
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import r2_score,mean_squared_error
def models(model):
    """Fit ``model`` on the training split and record its test-set metrics.

    The estimator is trained on the module-level ``X_train`` / ``y_train``
    and evaluated on ``X_test`` / ``y_test``. The R2 score and the RMSE are
    appended to the module-level lists ``R2`` and ``RMSE``; nothing is
    returned.

    Args:
        model: an estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    model.fit(X_train, y_train)
    pre = model.predict(X_test)
    R2.append(r2_score(y_test, pre))
    # Square root of the MSE, so the error is in the same unit as the target.
    RMSE.append(np.sqrt(mean_squared_error(y_test, pre)))
# Display names, in the same order as the estimators evaluated below.
Algorithms = ['LinearRegression', 'DecisionTreeRegressor', 'RandomForestRegressor',
              'GradientBoostingRegressor', 'XGBRegressor', 'SVR']
# Filled by models(): one entry per evaluated estimator.
R2 = []
RMSE = []
model1 = LinearRegression()
model2 = DecisionTreeRegressor(random_state=42)
# Seed the stochastic ensembles as well so the comparison is reproducible.
model3 = RandomForestRegressor(random_state=42)
model4 = GradientBoostingRegressor(random_state=42)
model5 = XGBRegressor()
model6 = SVR()
for estimator in (model1, model2, model3, model4, model5, model6):
    models(estimator)
df1 = pd.DataFrame({'Algorithms': Algorithms, 'R2_score': R2, 'RMSE': RMSE})
# Show the comparison table.
print(df1)
output:
Algorithms R2_score RMSE
0 LinearRegression 0.233513 10128.813642
1 DecisionTreeRegressor 0.582993 7470.986717
2 RandomForestRegressor 0.772239 5521.352505
3 GradientBoostingClassifier 0.637696 6963.746857
4 XGBRegressor 0.749141 5794.573718
5 SVR -0.014145 11650.813825
5.深度学习模型
from keras.models import Sequential
from keras.layers import Dense
# Small fully connected regression network.
# The hidden layers use ReLU: a stack of purely linear layers collapses into
# a single linear map, so the network could not do better than plain linear
# regression. The output layer stays linear because the target is a price.
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(1, activation="linear"))
model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mean_squared_error"])
result = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, verbose=0)
# The loss is the MSE, so its square root is the validation RMSE per epoch.
print(np.sqrt(result.history["val_loss"]))
output:
[18120.89683427 15209.23247943 11820.44869939 10217.87591285
9988.96333788 9967.14313751 9955.55527899 9952.93869747
9956.04175707 9956.86708593 9956.54835753 9952.67119581
9959.13493298 9955.99503163 9955.22404599 9953.39106622
9955.59464297 9956.9463529 9958.24650314 9957.58592009]
# `plt` was used without ever being imported; import it where it is needed,
# following the inline-import style of the rest of this script.
import matplotlib.pyplot as plt

# Training and validation loss per epoch.
plt.figure(figsize=(15,8))
plt.plot(result.history["loss"])
plt.plot(result.history["val_loss"])
plt.title("Loss Graph")
plt.ylabel("Loss")
plt.xlabel("Epochs")
plt.legend(["Training", "Validation"])
plt.show()