Kaggle-汽车价格预测项目

1.读取数据，查看数据基本概况

import numpy as np
import pandas  as pd
# Load the car-price dataset from the current working directory.
csv_path = r'./car_price_prediction.csv'
data = pd.read_csv(csv_path)

# Preview the first 5 rows.
first_rows = data.head(5)
print(first_rows)

output:
             ID  Price  Levy  ...             Wheel   Color  Airbags
0  45654403  13328  1399  ...        Left wheel  Silver       12
1  44731507  16621  1018  ...        Left wheel   Black        8
2  45774419   8467     -  ...  Right-hand drive   Black        2
3  45769185   3607   862  ...        Left wheel   White        0
4  45809263  11726   446  ...        Left wheel  Silver        4

[5 rows x 18 columns]

# DataFrame.info() prints the per-column summary (non-null counts and dtypes)
# itself and returns None, so it has to be called rather than handed to
# print() as an attribute, which would only show the bound method.
data.info()

output:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object 
 17  Airbags           19237 non-null  int64  
dtypes: float64(1), int64(4), object(13)

#可以看到数据有19237行、18列，各列的非空计数都等于总行数，即没有空值（NaN）

# Count the rows that are exact duplicates of an earlier row.
duplicate_count = data.duplicated().sum()
print(duplicate_count)

output：
    313
# Drop the duplicated rows (rows, not columns), keeping the first occurrence.
data = data.drop_duplicates()

# Print the value distribution of every column. A 30-dash line is printed
# after each column so that consecutive reports are visually separated.
for col in data.columns:
    print(col)
    print(data[col].value_counts())
    print('-' * 30)

output:
    ID
45815365    8
45815361    8
45815363    7
45815368    7
45723475    7
           ..
45774312    1
45732621    1
45773011    1
45774019    1
45813273    1
Name: ID, Length: 18924, dtype: int64
------------------------------
Price
15681    280
470      274
14113    244
392      242
314      235
        ... 
42601      1
149        1
54349      1
54954      1
22075      1
Name: Price, Length: 2315, dtype: int64
------------------------------
Levy
-       5819
765      486
891      461
639      410
640      405
        ... 
3156       1
2908       1
1279       1
1719       1
1901       1
Name: Levy, Length: 559, dtype: int64
------------------------------
Manufacturer
HYUNDAI          3769
TOYOTA           3662
MERCEDES-BENZ    2076
FORD             1111
CHEVROLET        1069
                 ... 
TESLA               1
PONTIAC             1
SATURN              1
ASTON MARTIN        1
GREATWALL           1
Name: Manufacturer, Length: 65, dtype: int64
------------------------------
Model
Prius                    1083
Sonata                   1079
Camry                     938
Elantra                   922
E 350                     542
                         ... 
Feroza                      1
C-MAX C-MAX                 1
X1 4X4                      1
Land Cruiser Prado RX       1
Prius C aqua                1
Name: Model, Length: 1590, dtype: int64
------------------------------
Prod. year
2012    2155
2014    2124
2013    1963
2011    1612
2015    1549
2010    1483
2016    1476
2017     959
2008     737
2009     601
2018     500
2007     464
2005     402
2003     367
2004     364
2006     317
2019     306
2002     296
2000     279
2001     254
1998     213
1999     207
1997     151
1996     114
1995     105
2020      47
1994      42
1992      30
1993      23
1990      18
1988      12
1991      10
1986       6
1989       6
1987       5
1984       5
1985       5
1953       4
1983       3
1939       3
1978       2
1980       2
1965       2
1977       2
1974       2
1964       2
1943       1
1976       1
1957       1
1968       1
1947       1
1982       1
1981       1
1973       1
Name: Prod. year, dtype: int64
------------------------------
Category
Sedan          8736
Jeep           5473
Hatchback      2847
Minivan         647
Coupe           532
Universal       364
Microbus        306
Goods wagon     233
Pickup           52
Cabriolet        36
Limousine        11
Name: Category, dtype: int64
------------------------------
Leather interior
Yes    13954
No      5283
Name: Leather interior, dtype: int64
------------------------------
Fuel type
Petrol            10150
Diesel             4036
Hybrid             3578
LPG                 892
CNG                 494
Plug-in Hybrid       86
Hydrogen              1
Name: Fuel type, dtype: int64
------------------------------
Engine volume
2            3916
2.5          2277
1.8          1760
1.6          1462
1.5          1321
             ... 
6.8             1
6.7             1
3.1             1
0.8 Turbo       1
1.1 Turbo       1
Name: Engine volume, Length: 107, dtype: int64
------------------------------
Mileage
0 km         721
200000 km    183
150000 km    161
160000 km    120
100000 km    119
            ... 
63083 km       1
28750 km       1
25077 km       1
77452 km       1
186923 km      1
Name: Mileage, Length: 7687, dtype: int64
------------------------------
Cylinders
4.0     14367
6.0      3462
8.0       991
5.0       169
3.0       107
2.0        42
1.0        38
12.0       38
10.0       12
16.0        5
7.0         4
9.0         1
14.0        1
Name: Cylinders, dtype: int64
------------------------------
Gear box type
Automatic    13514
Tiptronic     3102
Manual        1875
Variator       746
Name: Gear box type, dtype: int64
------------------------------
Drive wheels
Front    12874
4x4       4058
Rear      2305
Name: Drive wheels, dtype: int64
------------------------------
Doors
04-May    18332
02-Mar      777
>5          128
Name: Doors, dtype: int64
------------------------------
Wheel
Left wheel          17753
Right-hand drive     1484
Name: Wheel, dtype: int64
------------------------------
Color
Black            5033
White            4489
Silver           3792
Grey             2375
Blue             1396
Red               639
Green             322
Orange            253
Brown             187
Carnelian red     179
Golden            145
Beige             134
Sky blue          122
Yellow            106
Purple             39
Pink               26
Name: Color, dtype: int64
------------------------------
Airbags
4     5823
12    5654
0     2405
8     1608
6     1311
2     1066
10     849
5      104
16      93
7       86
1       76
9       63
3       37
11      33
14      20
15       7
13       2
Name: Airbags, dtype: int64
------------------------------

可以看到：Levy 中取值为 '-' 的记录有 5819 条；Engine volume 的部分取值带有 “Turbo” 标记（表示涡轮增压，并不是单位）；Doors 的取值是类似日期的字符串（“04-May”、“02-Mar”，很可能是 “4-5”、“2-3” 被表格软件误转成了日期），另外还含有大于号（“>5”）；Mileage 带有单位 km；Prod. year 表示汽车的生产年份，需要据此计算汽车已经使用了多少年（车龄）。我们将在接下来的数据清洗步骤中解决这些问题。

2.数据清洗

# Data cleaning.

# Levy means tax; '-' is taken here to mean "no levy", so it is replaced by 0
# before the column is converted to integers.
levy_text = data['Levy'].str.replace('-', '0', regex=False)
data['Levy'] = levy_text.astype(int)

# Strip the 'km' unit so the mileage can be stored as a number.
mileage_text = data['Mileage'].str.replace('km', '', regex=False)
data['Mileage'] = mileage_text.astype(float)


def _leading_door_count(raw):
    """Return the numeric text of a Doors value: the part before '-' or the value without '>'."""
    if "-" in raw:
        return raw.split("-")[0]
    return raw.replace(">", "")


# Extract the door count from values such as '04-May', '02-Mar' and '>5'.
data["Doors"] = data["Doors"].map(_leading_door_count).astype(int)

# New 0/1 feature: 1 when the engine-volume text contains 'Turbo'.
# It must be derived before the marker is stripped from the column below.
data['is_turbo'] = data['Engine volume'].str.contains('Turbo', regex=False).astype(int)

# Remove the 'Turbo' marker so the engine volume becomes a plain float.
volume_text = data['Engine volume'].str.replace('Turbo', '', regex=False)
data['Engine volume'] = volume_text.astype(float)

# Number of distinct values per column.
print(data.nunique())
output:
ID                  18924
Price                2315
Levy                  559
Manufacturer           65
Model                1590
Prod. year             54
Category               11
Leather interior        2
Fuel type               7
Engine volume         107
Mileage              7687
Cylinders              13
Gear box type           4
Drive wheels            3
Doors                   3
Wheel                   2
Color                  16
Airbags                17
is_turbo                2

from datetime import datetime

# Vehicle age in years, measured against the year in which the script runs,
# so the values shift by one whenever the analysis is re-run in a later year.
years_now = datetime.now().year

data['age'] = years_now - data['Prod. year']

print(data[['Prod. year', 'age']].head(5))

output:
       Prod. year  age
0        2010       14
1        2011       13
2        2006       18
3        2011       13
4        2014       10

from sklearn.preprocessing import LabelEncoder

# Many features are categorical text and have to be converted to integer
# codes before modelling.
category_col = [
    'Manufacturer',
    'Model',
    'Category',
    'Leather interior',
    'Fuel type',
    'Gear box type',
    'Drive wheels',
    'Wheel',
    'Color',
]

# A single encoder object is re-fitted for each column in turn, so after the
# loop it only remembers the classes of the last column.
le = LabelEncoder()
for name in category_col:
    data[name] = le.fit(data[name]).transform(data[name])

# Drop the columns that are not used as model inputs.
data = data.drop(columns=['ID', 'Prod. year'])

# Detect outliers with the 1.5 * IQR rule and remove the affected rows.
# Rows are filtered column by column, so each column's quartiles are computed
# on the rows that survived the filters of the columns before it.
num_col = ['Price', 'Levy', 'age', 'Engine volume', 'Mileage', 'Airbags']

for col in num_col:
    values = data[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    outlier = (values.lt(low) | values.gt(high)).sum()
    # Despite its name this is the number of rows currently in the frame; it
    # is the denominator of the percentage printed below.
    total_outliers = len(values)
    print(f"Total Outliars in {col} are : {outlier} : {round(100*(outlier)/total_outliers,2)}%")

    if outlier > 0:
        # Keep only the rows inside [low, high] (both bounds inclusive).
        data = data.loc[values.between(low, high)]
output:

Total Outliars in Price are : 1055 : 5.57%
Total Outliars in Levy are : 161 : 0.9%
Total Outliars in age are : 1468 : 8.29%
Total Outliars in Engine volume are : 951 : 5.86%
Total Outliars in Mileage are : 623 : 4.07%
Total Outliars in Airbags are : 0 : 0.0%

3.数据标准化

from sklearn.preprocessing import StandardScaler

# Remove any rows that still contain missing values.
data = data.dropna()

# Standardise every column except the target, Price.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows in this file, so test-set statistics influence the scaling;
# consider fitting on the training part only.
scaler = StandardScaler()
feature_frame = data.drop(columns=["Price"])
scaled_df = scaler.fit_transform(feature_frame)

4.建模

from sklearn.model_selection import train_test_split

# Features are the scaled columns; the target is the price.
X = scaled_df
y = data["Price"]

# Hold out 20% of the rows for testing, then take 10% of the remaining
# training rows as a validation set. Both splits use a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# Report the shape of every split.
for label, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_val", X_val),
    ("y_val", y_val),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{label} shape: ", part.shape)

output:

X_train shape:  (10558, 17)
y_train shape:  (10558,)
X_val shape:  (1174, 17)
y_val shape:  (1174,)
X_test shape:  (2934, 17)
y_test shape:  (2934,)

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split,GridSearchCV

from sklearn.metrics import r2_score,mean_squared_error

def models(model):
    """Fit ``model`` on the training split and record its test-set metrics.

    The estimator is trained on the module-level ``X_train``/``y_train`` and
    evaluated on ``X_test``/``y_test``. The R2 score is appended to the
    module-level list ``R2`` and the root-mean-squared error to ``RMSE``, so
    both lists must exist before the first call.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.

    Returns:
        tuple: ``(r2, rmse)`` computed on the test split.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    r2 = r2_score(y_test, predictions)
    # The square root puts the error back into the unit of the target.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    R2.append(r2)
    RMSE.append(rmse)
    return r2, rmse

# Metric accumulators filled by models(); one entry per evaluated estimator.
R2 = []
RMSE = []

# Candidate regressors. The randomised ensemble models get the same fixed
# seed as the decision tree so that repeated runs produce the same table.
model1 = LinearRegression()
model2 = DecisionTreeRegressor(random_state=42)
model3 = RandomForestRegressor(random_state=42)
model4 = GradientBoostingRegressor(random_state=42)
model5 = XGBRegressor()
model6 = SVR()
estimators = [model1, model2, model3, model4, model5, model6]

# Labels are derived from the estimator classes so they cannot drift from the
# models that are actually evaluated (the fourth one is a regressor, not a
# classifier).
Algorithms = [type(estimator).__name__ for estimator in estimators]

for estimator in estimators:
    models(estimator)

# One row per algorithm, in evaluation order.
df1 = pd.DataFrame({'Algorithms': Algorithms, 'R2_score': R2, 'RMSE': RMSE})
print(df1)

output:

                   Algorithms  R2_score          RMSE
0            LinearRegression  0.233513  10128.813642
1       DecisionTreeRegressor  0.582993   7470.986717
2       RandomForestRegressor  0.772239   5521.352505
3  GradientBoostingClassifier  0.637696   6963.746857
4                XGBRegressor  0.749141   5794.573718
5                         SVR -0.014145  11650.813825

5.深度学习模型

from keras.models import Sequential
from keras.layers import Dense

# Small fully connected regression network.
# The hidden layers use ReLU: with linear activations everywhere, the stacked
# Dense layers compose into a single linear map, so the network could not
# learn anything a plain linear regression cannot.
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation="relu"))
model.add(Dense(32, activation="relu"))
# Single linear output unit for the unbounded regression target.
model.add(Dense(1, activation="linear"))
model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mean_squared_error"])
result = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, verbose=0)

# The loss is the mean squared error, so its square root is the validation
# RMSE for each epoch.
print(np.sqrt(result.history["val_loss"]))

output:
    [18120.89683427 15209.23247943 11820.44869939 10217.87591285
  9988.96333788  9967.14313751  9955.55527899  9952.93869747
  9956.04175707  9956.86708593  9956.54835753  9952.67119581
  9959.13493298  9955.99503163  9955.22404599  9953.39106622
  9955.59464297  9956.9463529   9958.24650314  9957.58592009]

# Plot the training and validation loss recorded for each epoch.
# NOTE(review): `plt` is not imported anywhere in the visible source;
# presumably `import matplotlib.pyplot as plt` is intended - confirm and add it.
plt.figure(figsize=(15, 8))
for history_key in ("loss", "val_loss"):
    plt.plot(result.history[history_key])
plt.title("Loss Graph")
plt.xlabel("Epochs")
plt.ylabel("Loss")
# Legend order follows the plotting order above.
plt.legend(["Training", "Validation"])
plt.show()