流程
- 导入所要使用的包
- 引入kaggle的数据集csv文件
- 查看数据集有无空值
- 填充这些空值
- 提取特征
- 分离训练集和测试集
- 调用模型
数据资源获取
数据资源获取
导入需要的包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
引入kaggle的数据集csv文件
数据集有五千万行,就我的笔记本而言,还是只用前5万条数据(nrows=50000)训练个模型
# Read only the first 50,000 training rows; the test set is read in full.
train = pd.read_csv('train1.csv', nrows=50_000)
test = pd.read_csv('test.csv')
# Keep the test keys aside: the `key` column is dropped from `test` later,
# but it is needed again when the submission file is written.
test_ids = test['key']
train.head()
查看数据集有无空值
# Count the missing values in every training column.
train.isna().sum()
空值处理
这5万条数据里只出现了这么几条含空值的数据,直接删除
# Drop every row that has at least one missing value
# (dropna defaults: how='any', axis=0).
train = train.dropna()
看一下test中的数据
# `test` and `test_ids` were already loaded from 'test.csv' in the data-loading
# step and have not been modified since, so the file is not read a second time.
test.head()
test.isnull().sum()
# Result of the check above: the test set contains no missing values.
特征操作
- 车费的价格不能为负数(>=0)
# Keep only rows whose fare is non-negative.
train = train.loc[train['fare_amount'] >= 0]
- 看一下坐标的范围
# Print the coordinate extent of the test set: min/max longitude, then
# min/max latitude, each taken over both the pickup and the dropoff column.
print(test[['pickup_longitude', 'dropoff_longitude']].min().min())
print(test[['pickup_longitude', 'dropoff_longitude']].max().max())
print(test[['pickup_latitude', 'dropoff_latitude']].min().min())
print(test[['pickup_latitude', 'dropoff_latitude']].max().max())
# Output recorded from a previous run:
# -74.263242
# -72.986532
# 40.568973
# 41.709555
def select_train(df, fw):
    """Return a boolean mask of trips that start and end inside a bounding box.

    Args:
        df: DataFrame with ``pickup_longitude``, ``pickup_latitude``,
            ``dropoff_longitude`` and ``dropoff_latitude`` columns.
        fw: Indexable bounding box whose first four items are
            (min_longitude, max_longitude, min_latitude, max_latitude).
            All four bounds are inclusive.

    Returns:
        Boolean Series aligned with ``df``; True where both the pickup and the
        dropoff point lie inside the box.
    """
    # Only the first four items are used, exactly like indexing fw[0]..fw[3].
    min_lon, max_lon, min_lat, max_lat = fw[:4]
    # Series.between is inclusive on both ends by default (>= and <=).
    return (
        df['pickup_longitude'].between(min_lon, max_lon)
        & df['pickup_latitude'].between(min_lat, max_lat)
        & df['dropoff_longitude'].between(min_lon, max_lon)
        & df['dropoff_latitude'].between(min_lat, max_lat)
    )
# Bounding box as (min longitude, max longitude, min latitude, max latitude).
fw = (-74.2, -73, 40.5, 41.8)
# Keep only the training trips whose pickup and dropoff are both inside it.
train = train.loc[select_train(train, fw)]
- 按照时间提取特征
# Derive new features from the pickup timestamp.
def deal_time_features(df):
    """Parse ``pickup_datetime`` and add hour/month/year/weekday columns.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``pickup_datetime`` column holding either strings
            that start with ``YYYY-MM-DD HH:MM`` or already-parsed datetimes.

    Returns:
        The same DataFrame, with ``pickup_datetime`` converted to a UTC
        datetime column and integer ``hour``, ``month``, ``year`` and
        ``weekday`` columns added.
    """
    stamps = df['pickup_datetime']
    # Skip parsing when the column is already datetime, so running this step
    # twice on the same frame does not fail on the `.str` accessor.
    if not pd.api.types.is_datetime64_any_dtype(stamps):
        # Keep only the first 16 characters ("YYYY-MM-DD HH:MM"); anything
        # after the minutes is discarded before parsing.
        stamps = pd.to_datetime(
            stamps.str.slice(0, 16), utc=True, format='%Y-%m-%d %H:%M'
        )
        df['pickup_datetime'] = stamps
    df['hour'] = stamps.dt.hour
    df['month'] = stamps.dt.month
    df["year"] = stamps.dt.year
    df["weekday"] = stamps.dt.weekday
    return df
# Add the time-derived columns to the training frame first, then the test frame.
train, test = (deal_time_features(frame) for frame in (train, test))
- 根据坐标计算距离
# Convert a pair of coordinates into a distance.
def distance(x1, y1, x2, y2):
    """Return the great-circle (haversine) distance in miles.

    Works element-wise on scalars, NumPy arrays and pandas Series.

    Args:
        x1: Latitude of the first point, in degrees.
        y1: Longitude of the first point, in degrees.
        x2: Latitude of the second point, in degrees.
        y2: Longitude of the second point, in degrees.

    Returns:
        Distance in miles, with the same shape as the broadcast inputs.
    """
    p = 0.017453292519943295  # pi / 180: degrees -> radians
    # Haversine term written with cosines: hav(dlat) + cos*cos*hav(dlon).
    a = (
        0.5
        - np.cos((x2 - x1) * p) / 2
        + np.cos(x1 * p) * np.cos(x2 * p) * (1 - np.cos((y2 - y1) * p)) / 2
    )
    # Rounding can push `a` marginally outside [0, 1], which would turn
    # sqrt/arcsin into NaN; clamp it back into the valid domain.
    a = np.minimum(1.0, np.maximum(0.0, a))
    # 12742 km is the Earth's diameter (2 * 6371); 0.6213712 converts km to miles.
    dis = 0.6213712 * 12742 * np.arcsin(np.sqrt(a))
    return dis
# Trip length in miles for both frames; `distance` takes latitude before longitude.
train['distance_miles'] = distance(
    x1=train['pickup_latitude'],
    y1=train['pickup_longitude'],
    x2=train['dropoff_latitude'],
    y2=train['dropoff_longitude'],
)
test['distance_miles'] = distance(
    x1=test['pickup_latitude'],
    y1=test['pickup_longitude'],
    x2=test['dropoff_latitude'],
    y2=test['dropoff_longitude'],
)
train.head()
- 去除票价和距离同时为0的数据
# Drop the rows where the trip distance and the fare are both exactly zero.
train = train.drop(
    index=train.query('distance_miles == 0 and fare_amount == 0').index
)
- 删除fare_amount小于2.5的数据,因为纽约出租车的起步价为2.5
# Drop the rows whose fare is below 2.5.
train = train.drop(index=train.query('fare_amount < 2.5').index)
- 去除乘客人数大于等于7的数据
# Drop the rows that report seven or more passengers.
train = train.drop(index=train.query('passenger_count >= 7').index)
- 删除不再需要的列(key 和 pickup_datetime)
# Remove the identifier and the raw timestamp from both frames and keep
# independent copies of the results.
train, test = (
    frame.drop(columns=['key', 'pickup_datetime']).copy()
    for frame in (train, test)
)
- 看特征和标签的关联度
# Correlation of every column with the fare.
train.corr().loc[:, 'fare_amount']
调用模型
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Split features and target by column name rather than by position, so the
# step does not silently depend on `fare_amount` being the first column.
x, r = train.drop(columns='fare_amount'), train['fare_amount']
linear_model = LinearRegression()
linear_model.fit(x, r)
# Predict on the test features in exactly the training column order; a column
# missing from `test` raises a KeyError here instead of misaligning features.
prediction = linear_model.predict(test[x.columns])
# Pair each saved test key with its predicted fare and save the result.
res = pd.DataFrame({'key': test_ids, 'fare_amount': prediction})
res.to_csv('submission.csv', index=False)