Kaggle (4): Regression with an Abalone Dataset
import pandas as pd
import numpy as np
import xgboost
import lightgbm
import optuna
import catboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
PROJECT DESCRIPTION
Predicting the age of abalone from physical measurements. The age of an abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope.
PHYSICAL ATTRIBUTES
SEX: Male/Female/Infant
LENGTH: Longest shell measurement
DIAMETER: Diameter of the Abalone
HEIGHT: Height of the Abalone
WHOLE WEIGHT: Weight of the whole abalone
SHUCKED WEIGHT: Weight of the meat
VISCERA WEIGHT: Gut weight (internal organs)
SHELL WEIGHT: Shell Weight after drying
RINGS: Number of rings; rings + 1.5 gives the age of the abalone
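The rings-to-age relation is a fixed offset, so converting a predicted ring count back to age is a one-liner (an illustrative sketch; the models below predict Rings directly):
rings = 9            # a hypothetical prediction
age = rings + 1.5    # age in years, per the dataset description (10.5 here)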
Load the Datasets
# original = pd.read_csv("/kaggle/input/abalone-dataset/abalone.csv")
# train = pd.read_csv("/kaggle/input/playground-series-s4e4/train.csv")
# test = pd.read_csv("/kaggle/input/playground-series-s4e4/test.csv")
original = pd.read_csv("./data/abalone.csv")
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
Make the Data Ready for Tuning
train = train.drop("id", axis=1)
train = train.rename(columns={"Whole weight.1": "Shucked weight", "Whole weight.2": "Viscera weight"})
test = test.rename(columns={"Whole weight.1": "Shucked weight", "Whole weight.2": "Viscera weight"})
train = pd.concat([train, original], axis=0)
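The concatenation silently relies on the renamed competition columns matching the original dataset, so a minimal sanity check is worth a line (a sketch):
assert list(train.columns) == list(original.columns), "competition and original columns do not line up"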
Get Familiar with the Data
train.head()
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Rings |
|---|---|---|---|---|---|---|---|---|---|
| 0 | F | 0.550 | 0.430 | 0.150 | 0.7715 | 0.3285 | 0.1465 | 0.2400 | 11 |
| 1 | F | 0.630 | 0.490 | 0.145 | 1.1300 | 0.4580 | 0.2765 | 0.3200 | 11 |
| 2 | I | 0.160 | 0.110 | 0.025 | 0.0210 | 0.0055 | 0.0030 | 0.0050 | 6 |
| 3 | M | 0.595 | 0.475 | 0.150 | 0.9145 | 0.3755 | 0.2055 | 0.2500 | 10 |
| 4 | I | 0.555 | 0.425 | 0.130 | 0.7820 | 0.3695 | 0.1600 | 0.1975 | 9 |
print(f"The shape of training dataset is : {train.shape}")
print(f"The shape of testing dataset is : {test.shape}")
The shape of training dataset is : (94792, 9)
The shape of testing dataset is : (60411, 9)
test.head()
| | id | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 90615 | M | 0.645 | 0.475 | 0.155 | 1.2380 | 0.6185 | 0.3125 | 0.3005 |
| 1 | 90616 | M | 0.580 | 0.460 | 0.160 | 0.9830 | 0.4785 | 0.2195 | 0.2750 |
| 2 | 90617 | M | 0.560 | 0.420 | 0.140 | 0.8395 | 0.3525 | 0.1845 | 0.2405 |
| 3 | 90618 | M | 0.570 | 0.490 | 0.145 | 0.8740 | 0.3525 | 0.1865 | 0.2350 |
| 4 | 90619 | I | 0.415 | 0.325 | 0.110 | 0.3580 | 0.1575 | 0.0670 | 0.1050 |
train.groupby("Sex").count()["Length"]
Sex
F 27802
I 34435
M 32555
Name: Length, dtype: int64
test.groupby("Sex").count()["Length"]
Sex
F 17387
I 22241
M 20783
Name: Length, dtype: int64
np.sort(pd.unique(train.Rings))
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29], dtype=int64)
View the Distribution
train.hist(figsize=(12, 10), grid=True, bins=50)
plt.tight_layout()
plt.axis("off")
test.hist(figsize=(12, 10), grid=True, bins=50)
plt.tight_layout()
plt.axis("off")
CONTINUOUS COLUMN ANALYSIS
# Set up warnings to be ignored (optional)
warnings.filterwarnings("ignore")
pd.set_option('mode.use_inf_as_na', False)
train_str = train.copy()  # copy so the string cast below does not mutate train itself
train_str['Rings'] = train_str['Rings'].astype(str)
# List of continuous variables in your dataset
continuous_vars = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight']
# Set hue to your target column
target_column = 'Rings'
for column in continuous_vars:
fig, axes = plt.subplots(1, 2, figsize=(18, 4)) # Create subplots with 1 row and 2 columns
# Plot histogram with hue and explicit labels
sns.histplot(data=train_str, x=column, hue=target_column, bins=50, kde=True, ax=axes[0], palette='muted', legend=False)
axes[0].set_title(f'Histogram of {column} with {target_column} Hue')
axes[0].set_xlabel(column)
axes[0].set_ylabel('Count')
axes[0].legend(title=target_column, loc='upper right')
# Plot KDE plot with hue and explicit labels
sns.kdeplot(data=train_str, x=column, hue=target_column, ax=axes[1], palette='muted', legend=False)
axes[1].set_title(f'KDE Plot of {column} with {target_column} Hue')
axes[1].set_xlabel(column)
axes[1].set_ylabel('Density')
axes[1].legend(title=target_column, loc='upper right')
plt.tight_layout() # Adjust spacing between subplots
plt.show()
ANALYSIS BY QQ PLOT
import scipy.stats as stats
def qq_plot_with_skewness(data, quantitative_var):
# Check if the variable is present in the DataFrame
if quantitative_var not in data.columns:
print(f"Error: '{quantitative_var}' not found in the DataFrame.")
return
f, ax = plt.subplots(1, 2, figsize=(18, 5.5))
# Check for missing values
if data[quantitative_var].isnull().any():
print(f"Warning: '{quantitative_var}' contains missing values. Results may be affected.")
# QQ plot
stats.probplot(data[quantitative_var], plot=ax[0], fit=True)
ax[0].set_title(f'QQ Plot for {quantitative_var}')
# Skewness plot
sns.histplot(data[quantitative_var], kde=True, ax=ax[1])
ax[1].set_title(f'Distribution of {quantitative_var}')
# Calculate skewness value
skewness_value = stats.skew(data[quantitative_var])
# Display skewness value on the plot
ax[1].text(0.5, 0.5, f'Skewness: {skewness_value:.2f}', transform=ax[1].transAxes,
horizontalalignment='center', verticalalignment='center', fontsize=16, color='red')
plt.show()
# Example usage for each continuous variable
for var in continuous_vars:
qq_plot_with_skewness(train, var)
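The QQ plots flag the weight features as right-skewed; the same np.log1p used later as the target transform can be previewed on them — a minimal sketch reusing the imports above:
for var in ["Whole weight", "Shucked weight"]:
    print(f"{var}: skewness {stats.skew(train[var]):.2f} -> {stats.skew(np.log1p(train[var])):.2f} after log1p")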
Split the Dataset
sex_to_num = {
"M": 0,
"F": 1,
"I": 2
}
train["Sex"] = train["Sex"].replace(sex_to_num.keys(), [sex_to_num[key] for key in sex_to_num])
test["Sex"] = test["Sex"].replace(sex_to_num.keys(), [sex_to_num[key] for key in sex_to_num])
train.groupby("Sex").count()["Length"]
Sex
0 32555
1 27802
2 34435
Name: Length, dtype: int64
X = train.drop("Rings", axis=1)
y = train.Rings
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)
Here the stratify parameter keeps the ratio of Rings values the same across all of the splits.
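The effect can be verified by comparing ring-value proportions across the splits — a quick sketch:
print(y_train.value_counts(normalize=True).sort_index().head())
print(y_valid.value_counts(normalize=True).sort_index().head())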
XGBoost
We will implement two XGBoost models.
def xgb_objective(trial):
params = {
"eta": trial.suggest_float("eta", 0.01, 1.0),
"gamma": 0.0,
"max_depth": trial.suggest_int("max_depth", 3, 20),
"min_child_weight": trial.suggest_float("min_child_weight", 1., 50.),
"subsample": trial.suggest_float("subsample", 0.5, 1.0),
"colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
"reg_lambda": trial.suggest_float("lambda", 1.0, 100.0),
"n_estimators": trial.suggest_int("n_estimators", 100, 1000)
}
xgb_reg = TransformedTargetRegressor(xgboost.XGBRegressor(**params, objective='reg:squarederror', grow_policy='lossguide',
tree_method="hist", random_state=42),
func=np.log1p,
inverse_func=np.expm1)
xgb_reg.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
val_scores = mean_squared_log_error(y_valid, xgb_reg.predict(X_valid), squared=False)
return val_scores
sampler = optuna.samplers.TPESampler(seed=42) # Using Tree-structured Parzen Estimator sampler for optimization
xgb_study = optuna.create_study(direction = 'minimize',study_name="XgbRegressor", sampler=sampler)
[I 2024-04-25 16:46:29,229] A new study created in memory with name: XgbRegressor
XGBoost 1
TUNE = False
if TUNE:
xgb_study.optimize(xgb_objective, n_trials=500)
Set TUNE to True in case you want to run hyperparameter tuning.
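After a tuning run, the best trial can be read off the study before hard-coding its parameters (sketch):
if TUNE:
    print(f"Best validation RMSLE: {xgb_study.best_value:.5f}")
    print(xgb_study.best_params)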
xgb_best_params_1 = {
'eta': 0.1006321838798394,
'max_depth': 6,
'min_child_weight': 27.999752791085136,
'subsample': 0.7344797943645852,
'colsample_bytree': 0.5389765810810496,
'lambda': 79.62358968148187,
'n_estimators': 407
}
xgb_reg_1 = TransformedTargetRegressor(xgboost.XGBRegressor(**xgb_best_params_1, objective='reg:squarederror', grow_policy='lossguide',
tree_method="hist", random_state=42, gamma=0.0),
func=np.log1p,
inverse_func=np.expm1)
xgb_reg_1.fit(X_train, y_train)
TransformedTargetRegressor(func=<ufunc 'log1p'>, inverse_func=<ufunc 'expm1'>, regressor=XGBRegressor(colsample_bytree=0.5389765810810496, eta=0.1006321838798394, gamma=0.0, grow_policy='lossguide', lambda=79.62358968148187, max_depth=6, min_child_weight=27.999752791085136, n_estimators=407, ...))
mean_squared_log_error(y_valid, xgb_reg_1.predict(X_valid), squared=False)
0.1484868328972631
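Note that squared=False is deprecated in recent scikit-learn; on 1.4+ the dedicated helper computes the same RMSLE (assuming that version is installed):
from sklearn.metrics import root_mean_squared_log_error
root_mean_squared_log_error(y_valid, xgb_reg_1.predict(X_valid))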
feature_importance = xgb_reg_1.regressor_.feature_importances_
feature_names = X_train.columns
sorted_indices = feature_importance.argsort()
sorted_importance = feature_importance[sorted_indices]
sorted_features = feature_names[sorted_indices]
plt.figure(figsize=(10, 6))
colors = plt.cm.tab20c.colors[:len(sorted_features)]
plt.barh(sorted_features, sorted_importance, color=colors)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('XGBoost Feature Importance')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
XGBoost 2
xgb_best_params_2 = {
'eta': 0.08999645298052271,
'max_depth': 6,
'min_child_weight': 2.088127882610971,
'subsample': 0.7725806961689413,
'colsample_bytree': 0.9163306027660207,
'lambda': 5.356530752285997,
'n_estimators': 652
}
xgb_reg_2 = TransformedTargetRegressor(xgboost.XGBRegressor(**xgb_best_params_2, objective='reg:squaredlogerror', grow_policy='depthwise',
tree_method="hist", random_state=42),
func=np.log1p,
inverse_func=np.expm1)
xgb_reg_2.fit(X_train, y_train)
TransformedTargetRegressor(func=<ufunc 'log1p'>, inverse_func=<ufunc 'expm1'>, regressor=XGBRegressor(colsample_bytree=0.9163306027660207, eta=0.08999645298052271, grow_policy='depthwise', lambda=5.356530752285997, max_depth=6, min_child_weight=2.088127882610971, n_estimators=652, ...))
mean_squared_log_error(y_valid, xgb_reg_2.predict(X_valid), squared=False)
0.14881444008796907
LightGBM
def lgbm_objective(trial):
# Define parameters to be optimized for the LGBMRegressor
param = {
"verbosity": -1,
"random_state": 42,
"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05),
"n_estimators": trial.suggest_int("n_estimators", 400, 1000),
"lambda_l1": trial.suggest_float("lambda_l1", 0.005, 0.015),
"lambda_l2": trial.suggest_float("lambda_l2", 0.02, 0.06),
"max_depth": trial.suggest_int("max_depth", 6, 14),
"colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.9),
"subsample": trial.suggest_float("subsample", 0.8, 1.0),
"min_child_samples": trial.suggest_int("min_child_samples", 10, 70),
"num_leaves": trial.suggest_int("num_leaves", 30, 100),
"min_split_gain": trial.suggest_float("min_split_gain", 0.1, 1.0)
}
lgbm_reg = lightgbm.LGBMRegressor(**param)
lgbm_reg.fit(X_train, y_train)
score = mean_squared_log_error(y_valid, lgbm_reg.predict(X_valid), squared=False)
return score
# Set up the sampler for Optuna optimization
sampler = optuna.samplers.TPESampler(seed=42) # Using Tree-structured Parzen Estimator sampler for optimization
# Create a study object for Optuna optimization
lgbm_study = optuna.create_study(direction="minimize", sampler=sampler)
[I 2024-04-25 16:46:32,141] A new study created in memory with name: no-name-889f25f6-876c-4982-ba46-f71528b83793
if TUNE:
# Run the optimization process
lgbm_study.optimize(lambda trial: lgbm_objective(trial), n_trials=200)
# Get the best parameters after optimization
lgbm_best_params = lgbm_study.best_params
print('='*50)
print(lgbm_best_params)
LightGBM 1
lgbm_params_1 = {
'learning_rate': 0.04090453688322824,
'n_estimators': 788,
'reg_lambda': 29.248167932522765,
'reg_alpha': 0.4583079398945705,
'max_depth': 19,
'colsample_bytree': 0.5439642175304692,
'subsample': 0.8659762900446526,
'min_child_samples': 12,
'num_leaves': 69,
'random_state': 42,
'n_jobs': -1,
'verbose': -1
}
lgbm_reg_1 = TransformedTargetRegressor(lightgbm.LGBMRegressor(**lgbm_params_1),
func=np.log1p,
inverse_func=np.expm1)
lgbm_reg_1.fit(X_train, y_train)
TransformedTargetRegressor(func=<ufunc 'log1p'>, inverse_func=<ufunc 'expm1'>, regressor=LGBMRegressor(colsample_bytree=0.5439642175304692, learning_rate=0.04090453688322824, max_depth=19, min_child_samples=12, n_estimators=788, n_jobs=-1, num_leaves=69, random_state=42, reg_alpha=0.4583079398945705, reg_lambda=29.248167932522765, subsample=0.8659762900446526, verbose=-1))
mean_squared_log_error(y_valid, lgbm_reg_1.predict(X_valid), squared=False)
0.1477381305885499
feature_importance = lgbm_reg_1.regressor_.feature_importances_
feature_names = X_train.columns
sorted_indices = feature_importance.argsort()
sorted_importance = feature_importance[sorted_indices]
sorted_features = feature_names[sorted_indices]
# Plot feature importance
plt.figure(figsize=(12, 8))
colors = plt.cm.Paired.colors[:len(sorted_features)]
plt.barh(sorted_features, sorted_importance, color=colors)
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('LightGBM Feature Importance', fontsize=14)
plt.gca().invert_yaxis()
for i, v in enumerate(sorted_importance):
plt.text(v + 0.02, i, f'{v:.2f}', color='black', va='center', fontsize=10)
plt.tight_layout()
plt.show()
LightGBM 2
lgbm_params_2 = {
'n_jobs': -1,
'verbose': -1,
'max_depth': 20,
'num_leaves': 165,
'subsample_freq': 1,
'random_state': 42,
'n_estimators': 1460,
'min_child_samples': 25,
'reg_lambda': 6.13475387151606,
'subsample': 0.8036874216939632,
'reg_alpha': 0.3152990674231573,
'learning_rate': 0.009336479469693189,
'colsample_bytree': 0.5780931837049811,
'min_child_weight': 0.37333232256934057,
}
lgbm_reg_2 = TransformedTargetRegressor(lightgbm.LGBMRegressor(**lgbm_params_2),
func=np.log1p,
inverse_func=np.expm1)
lgbm_reg_2.fit(X_train, y_train)
TransformedTargetRegressor(func=<ufunc 'log1p'>, inverse_func=<ufunc 'expm1'>, regressor=LGBMRegressor(colsample_bytree=0.5780931837049811, learning_rate=0.009336479469693189, max_depth=20, min_child_samples=25, min_child_weight=0.37333232256934057, n_estimators=1460, n_jobs=-1, num_leaves=165, random_state=42, reg_alpha=0.3152990674231573, reg_lambda=6.13475387151606, subsample=0.8036874216939632, subsample_freq=1, verbose=-1))
mean_squared_log_error(y_valid, lgbm_reg_2.predict(X_valid), squared=False)
0.14758741259851116
CatBoost
def cb_objective(trial):
params = {
"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
"max_depth": trial.suggest_int("depth", 4, 16),
"l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
"n_estimators": trial.suggest_int("n_estimators", 100, 1500),
"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
}
cb_reg = TransformedTargetRegressor(catboost.CatBoostRegressor(**params, random_state=42, grow_policy='SymmetricTree',
random_strength=0, cat_features=["Sex"], loss_function="RMSE"),
func=np.log1p,
inverse_func=np.expm1)
cb_reg.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
val_scores = np.sqrt(mean_squared_log_error(y_valid, np.abs(cb_reg.predict(X_valid))))
return val_scores
sampler = optuna.samplers.TPESampler(seed=42) # Using Tree-structured Parzen Estimator sampler for optimization
cb_study = optuna.create_study(direction = 'minimize',study_name="CBRegressor", sampler=sampler)
[I 2024-04-25 16:46:44,532] A new study created in memory with name: CBRegressor
if TUNE:
cb_study.optimize(cb_objective, 30)
CatBoost 1
cb_params_1 = {
'grow_policy': 'SymmetricTree',
'n_estimators': 1000,
'learning_rate': 0.128912681527133,
'l2_leaf_reg': 1.836927907521674,
'max_depth': 6,
'colsample_bylevel': 0.6775373040510968,
'random_strength': 0,
'boost_from_average': True,
'loss_function': 'RMSE',
'cat_features': ['Sex'],
'verbose': False
}
cat_reg_1 = TransformedTargetRegressor(catboost.CatBoostRegressor(**cb_params_1),
func=np.log1p,
inverse_func=np.expm1)
cat_reg_1.fit(X_train, y_train)
TransformedTargetRegressor(func=<ufunc 'log1p'>, inverse_func=<ufunc 'expm1'>, regressor=<catboost.core.CatBoostRegressor object at 0x0000021D08EB7310>)
mean_squared_log_error(y_valid, cat_reg_1.predict(X_valid), squared=False)
0.14824841372252795
CatBoost 2
cb_params_2 = {
    'depth': 15,
    'max_bin': 464,
    'verbose': False,
    'random_state': 42,
    'task_type': 'CPU',
    'min_data_in_leaf': 78,
    'loss_function': 'RMSE',
    'grow_policy': 'Lossguide',
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.83862137638162,
    'l2_leaf_reg': 8.365422739510098,
    'random_strength': 3.296124856352495,
    'learning_rate': 0.09992185242598203,
}
cat_reg_2 = TransformedTargetRegressor(catboost.CatBoostRegressor(**cb_params_2),
func=np.log1p,
inverse_func=np.expm1)
cat_reg_2.fit(X_train, y_train)
TransformedTargetRegressor(func=<ufunc 'log1p'>, inverse_func=<ufunc 'expm1'>, regressor=<catboost.core.CatBoostRegressor object at 0x0000021D088EB4C0>)
mean_squared_log_error(y_valid, cat_reg_2.predict(X_valid), squared=False)
0.14774083919007364
Ensembling the Results Using VotingRegressor
# weights = [0.025, 0.025, 0.275, 0.275, 0.05, 0.35]
ensemble = VotingRegressor(
[
# ("xgb_1", xgb_reg_1),
# ("xgb_2", xgb_reg_2),
("lgbm_1", lgbm_reg_1),
("lgbm_2", lgbm_reg_2),
("cb_1", cat_reg_1),
("cb_2", cat_reg_2)
]
)
ensemble.fit(X, y)
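The held-out X_test/y_test from the earlier split are never scored above; as a sanity check, a clone of the ensemble can be fit on the training fold only and evaluated there (a sketch; note VotingRegressor also accepts a weights= argument, which is what the commented-out weights above hint at):
from sklearn.base import clone
ensemble_check = clone(ensemble).fit(X_train, y_train)
print(mean_squared_log_error(y_test, ensemble_check.predict(X_test), squared=False))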
Submit the Output
pred = ensemble.predict(test.drop("id", axis=1))
submission = pd.DataFrame(test.id)
submission["Rings"] = pred
submission.to_csv("submission.csv", index=False)
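Optionally, predictions can be clipped to the observed minimum of 1 ring and the file re-saved before submitting — a sketch:
submission["Rings"] = submission["Rings"].clip(lower=1)  # rings in the training data start at 1
submission.to_csv("submission.csv", index=False)
print(submission.head())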