# AI夏令营 # Datawhale # 夏令营
在原有的Baseline上除了交叉验证,还有一种关键的优化方式,即特征工程。
如何优化特征,关系着我们提高模型预测的精准度。特征工程往往是对问题的领域有深入了解的人员能够做好的部分,因为我们要思考转换的方式。
Smiles特征之外还有很多特征可以提取有价值的信息,比如InChI是由一系列部分组成,提供了关于分子结构的详细信息。比如开头标识、分子式、连接表、氢原子计数、多可旋转键计数、立体化学信息、同分异构体信息、混合物或互变异构体信息、电荷和自旋多重度信息等。
除此之外,要想提升模型的精准度,换模型也未尝不可。
特征优化
提取分子式
从InChI字符串中,我们可以看到分子式直接给出在/C47H61N7O6S
部分。这意味着分子由47个碳原子、61个氢原子、7个氮原子、6个氧原子和1个硫原子组成;
计算分子量
分子量可以通过将每种原子的原子质量乘以其数量然后相加得到。
如
-
碳(C)的原子质量约为12.01 g/mol
-
氢(H)的原子质量约为1.008 g/mol
-
氮(N)的原子质量约为14.01 g/mol
-
氧(O)的原子质量约为16.00 g/mol
-
硫(S)的原子质量约为32.07 g/mol
乘以数量相加,我们就可以得到分子量。
原子计数
直接计算不同原子的个数,并进行展开。
import pandas as pd
import re
atomic_masses = {
'H': 1.008, 'He': 4.002602, 'Li': 6.94, 'Be': 9.0122, 'B': 10.81, 'C': 12.01,
'N': 14.01, 'O': 16.00, 'F': 19.00, 'Ne': 20.180, 'Na': 22.990, 'Mg': 24.305,
'Al': 26.982, 'Si': 28.085, 'P': 30.97, 'S': 32.07, 'Cl': 35.45, 'Ar': 39.95,
'K': 39.10, 'Ca': 40.08, 'Sc': 44.956, 'Ti': 47.867, 'V': 50.942, 'Cr': 52.00,
'Mn': 54.938, 'Fe': 55.845, 'Co': 58.933, 'Ni': 58.69, 'Cu': 63.55, 'Zn': 65.38
}
# 函数用于解析单个InChI字符串
def parse_inchi(row):
inchi_str = row['InChI']
formula = ''
molecular_weight = 0
element_counts = {}
# 提取分子式
formula_match = re.search(r"InChI=1S/([^/]+)/c", inchi_str)
if formula_match:
formula = formula_match.group(1)
# 计算分子量和原子计数
for element, count in re.findall(r"([A-Z][a-z]*)([0-9]*)", formula):
count = int(count) if count else 1
element_mass = atomic_masses.get(element.upper(), 0)
molecular_weight += element_mass * count
element_counts[element.upper()] = count
return pd.Series({
'Formula': formula,
'MolecularWeight': molecular_weight,
'ElementCounts': element_counts
})
# 应用函数到DataFrame的每一行
train[['Formula', 'MolecularWeight', 'ElementCounts']] = train.apply(parse_inchi, axis=1)
# 定义存在的key
keys = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn']
# 创建一个空的DataFrame,列名为keys
df_expanded = pd.DataFrame({key: pd.Series() for key in keys})
# 遍历数据,填充DataFrame
for index, item in enumerate(train['ElementCounts'].values):
for key in keys:
# 将字典中的值填充到相应的列中
df_expanded.at[index, key] = item.get(key, 0)
df_expanded = pd.DataFrame(df_expanded)
模型融合
上次提到了我们使用的是catboost模型,没有尝试过lightgbm和xgboost,可以依次跑完这三个模型,然后对三个模型的结果进行取平均进行融合(也是可以改进的地方)。
def cv_model(clf, train_x, train_y, test_x, clf_name, seed = 2023):
folds = 5
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
oof = np.zeros(train_x.shape[0])
test_predict = np.zeros(test_x.shape[0])
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
print('************************************ {} ************************************'.format(str(i+1)))
trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
if clf_name == "lgb":
train_matrix = clf.Dataset(trn_x, label=trn_y)
valid_matrix = clf.Dataset(val_x, label=val_y)
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'min_child_weight': 6,
'num_leaves': 2 ** 6,
'lambda_l2': 10,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 4,
'learning_rate': 0.35,
'seed': 2024,
'nthread' : 16,
'verbose' : -1,
}
model = clf.train(params, train_matrix, 2000, valid_sets=[train_matrix, valid_matrix],
categorical_feature=[], verbose_eval=1000, early_stopping_rounds=100)
val_pred = model.predict(val_x, num_iteration=model.best_iteration)
test_pred = model.predict(test_x, num_iteration=model.best_iteration)
if clf_name == "xgb":
xgb_params = {
'booster': 'gbtree',
'objective': 'binary:logistic',
'num_class':3,
'max_depth': 5,
'lambda': 10,
'subsample': 0.7,
'colsample_bytree': 0.7,
'colsample_bylevel': 0.7,
'eta': 0.35,
'tree_method': 'hist',
'seed': 520,
'nthread': 16
}
train_matrix = clf.DMatrix(trn_x , label=trn_y)
valid_matrix = clf.DMatrix(val_x , label=val_y)
test_matrix = clf.DMatrix(test_x)
watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
model = clf.train(xgb_params, train_matrix, num_boost_round=2000, evals=watchlist, verbose_eval=1000, early_stopping_rounds=100)
val_pred = model.predict(valid_matrix)
test_pred = model.predict(test_matrix)
if clf_name == "cat":
params = {'learning_rate': 0.35, 'depth': 5, 'bootstrap_type':'Bernoulli','random_seed':2024,
'od_type': 'Iter', 'od_wait': 100, 'random_seed': 11, 'allow_writing_files': False}
model = clf(iterations=2000, **params)
model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
metric_period=1000,
use_best_model=True,
cat_features=[],
verbose=1)
val_pred = model.predict_proba(val_x)
test_pred = model.predict_proba(test_x)
oof[valid_index] = val_pred
test_predict += test_pred / kf.n_splits
F1_score = f1_score(val_y, np.where(val_pred>0.5, 1, 0))
cv_scores.append(F1_score)
print(cv_scores)
return oof, test_predict
# 参考demo,具体对照baseline实践部分调用cv_model函数
# 选择lightgbm模型
lgb_oof, lgb_test = cv_model(lgb, x_train, y_train, x_test, 'lgb')
# 选择xgboost模型
xgb_oof, xgb_test = cv_model(xgb, x_train, y_train, x_test, 'xgb')
# 选择catboost模型
cat_oof, cat_test = cv_model(CatBoostClassifier, x_train, y_train, x_test, 'cat')
# 进行取平均融合
final_test = (lgb_test + xgb_test + cat_test) / 3
或者可以用stacking的方法
代码更正
要想在飞桨上跑数据,复制粘贴是不行的。由于部分模型版本更迭,下面给出完整的更正后代码。
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import f1_score
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.feature_extraction.text import TfidfVectorizer
import tqdm, sys, os, gc, re, argparse, warnings
warnings.filterwarnings('ignore')
train = pd.read_excel('./data/data280993/traindata-new.xlsx')
test = pd.read_excel('./data/data280993/testdata-new.xlsx')
# test数据不包含 DC50 (nM) 和 Dmax (%)
train = train.drop(['DC50 (nM)', 'Dmax (%)'], axis=1)
# 定义了一个空列表drop_cols,用于存储在测试数据集中非空值小于10个的列名。
drop_cols = []
for f in test.columns:
if test[f].notnull().sum() < 10:
drop_cols.append(f)
# 使用drop方法从训练集和测试集中删除了这些列,以避免在后续的分析或建模中使用这些包含大量缺失值的列
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)
# 使用pd.concat将清洗后的训练集和测试集合并成一个名为data的DataFrame,便于进行统一的特征工程处理
data = pd.concat([train, test], axis=0, ignore_index=True)
cols = data.columns[2:]
# 将SMILES转换为分子对象列表,并转换为SMILES字符串列表
data['smiles_list'] = data['Smiles'].apply(lambda x:[Chem.MolToSmiles(mol, isomericSmiles=True) for mol in [Chem.MolFromSmiles(x)]])
data['smiles_list'] = data['smiles_list'].map(lambda x: ' '.join(x))
# 使用TfidfVectorizer计算TF-IDF
tfidf = TfidfVectorizer(max_df = 0.9, min_df = 1, sublinear_tf = True)
res = tfidf.fit_transform(data['smiles_list'])
# 将结果转为dataframe格式
tfidf_df = pd.DataFrame(res.toarray())
tfidf_df.columns = [f'smiles_tfidf_{i}' for i in range(tfidf_df.shape[1])]
# 按列合并到data数据
data = pd.concat([data, tfidf_df], axis=1)
# 自然数编码
def label_encode(series):
unique = list(series.unique())
return series.map(dict(zip(
unique, range(series.nunique())
)))
for col in cols:
if data[col].dtype == 'object':
data[col] = label_encode(data[col])
train = data[data.Label.notnull()].reset_index(drop=True)
test = data[data.Label.isnull()].reset_index(drop=True)
lgb不支持特殊字符的输入,因此我们需要重写特征名称:
import re
def strict_clean_feature_name(name):
# 只保留字母、数字和下划线
name = re.sub(r'[^a-zA-Z0-9_]', '', name)
# 确保名称不为空,并且不以数字开头
if not name or name[0].isdigit():
name = 'f_' + name
return name
# 应用新的清理函数
x_train.columns = [strict_clean_feature_name(col) for col in x_train.columns]
x_test.columns = [strict_clean_feature_name(col) for col in x_test.columns]
# 再次检查清理后的特征名称
print(x_train.columns)
模型融合:
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
atomic_masses = {
'H': 1.008, 'He': 4.002602, 'Li': 6.94, 'Be': 9.0122, 'B': 10.81, 'C': 12.01,
'N': 14.01, 'O': 16.00, 'F': 19.00, 'Ne': 20.180, 'Na': 22.990, 'Mg': 24.305,
'Al': 26.982, 'Si': 28.085, 'P': 30.97, 'S': 32.07, 'Cl': 35.45, 'Ar': 39.95,
'K': 39.10, 'Ca': 40.08, 'Sc': 44.956, 'Ti': 47.867, 'V': 50.942, 'Cr': 52.00,
'Mn': 54.938, 'Fe': 55.845, 'Co': 58.933, 'Ni': 58.69, 'Cu': 63.55, 'Zn': 65.38
}
def parse_inchi(row):
inchi_str = row['InChI'] # Assuming 'InChI' is a column in your DataFrame
if isinstance(inchi_str, str): # Check if inchi_str is a string
formula_match = re.search(r"InChI=1S/([^/]+)/c", inchi_str)
if formula_match:
formula = formula_match.group(1)
molecular_weight = calculate_molecular_weight(formula) # You need to define this function
element_counts = extract_element_counts(formula) # You need to define this function
return pd.Series({
'Formula': formula,
'MolecularWeight': molecular_weight,
'ElementCounts': element_counts
})
else:
# Handle case where regex pattern does not match
return pd.Series({
'Formula': None,
'MolecularWeight': None,
'ElementCounts': None
})
else:
# Handle case where inchi_str is not a string (e.g., it could be None or unexpected type)
return pd.Series({
'Formula': None,
'MolecularWeight': None,
'ElementCounts': None
})
# 应用函数到DataFrame的每一行
train[['Formula', 'MolecularWeight', 'ElementCounts']] = train.apply(parse_inchi, axis=1)
# 定义存在的key
keys = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn']
# 创建一个空的DataFrame,列名为keys
df_expanded = pd.DataFrame({key: pd.Series() for key in keys})
for index, item in enumerate(train['ElementCounts'].values):
if item is not None:
for key in keys:
# 将字典中的值填充到相应的列中
df_expanded.at[index, key] = item.get(key, 0)
else:
# 如果 item 是 None,则所有元素计数设为 0
for key in keys:
df_expanded.at[index, key] = 0
df_expanded = pd.DataFrame(df_expanded)
def cv_model(clf, train_x, train_y, test_x, clf_name, seed = 2023):
folds = 5
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
oof = np.zeros(train_x.shape[0])
test_predict = np.zeros(test_x.shape[0])
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
print('************************************ {} ************************************'.format(str(i+1)))
trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
if clf_name == "lgb":
train_matrix = clf.Dataset(trn_x, label=trn_y)
valid_matrix = clf.Dataset(val_x, label=val_y)
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'min_child_weight': 6,
'num_leaves': 2 ** 6,
'lambda_l2': 10,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 4,
'learning_rate': 0.35,
'seed': 2024,
'verbose': -1,
}
model = clf.train(
params,
train_matrix,
num_boost_round=2000,
valid_sets=[train_matrix, valid_matrix],
categorical_feature=[],
callbacks=[
lgb.early_stopping(stopping_rounds=100),
lgb.log_evaluation(period=1000)
]
)
val_pred = model.predict(val_x, num_iteration=model.best_iteration)
test_pred = model.predict(test_x, num_iteration=model.best_iteration)
if clf_name == "xgb":
xgb_params = {
'booster': 'gbtree',
'objective': 'binary:logistic',
'num_class':3,
'max_depth': 5,
'lambda': 10,
'subsample': 0.7,
'colsample_bytree': 0.7,
'colsample_bylevel': 0.7,
'eta': 0.35,
'tree_method': 'hist',
'seed': 520,
'nthread': 16
}
train_matrix = clf.DMatrix(trn_x , label=trn_y)
valid_matrix = clf.DMatrix(val_x , label=val_y)
test_matrix = clf.DMatrix(test_x)
watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
model = clf.train(xgb_params, train_matrix, num_boost_round=2000, evals=watchlist, verbose_eval=1000, early_stopping_rounds=100)
val_pred = model.predict(valid_matrix)
test_pred = model.predict(test_matrix)
if clf_name == "cat":
params = {'learning_rate': 0.35, 'depth': 5, 'bootstrap_type':'Bernoulli','random_seed':2024,
'od_type': 'Iter', 'od_wait': 100, 'random_seed': 11, 'allow_writing_files': False}
model = clf(iterations=2000, **params)
model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
metric_period=1000,
use_best_model=True,
cat_features=[],
verbose=1)
val_pred = model.predict_proba(val_x)
test_pred = model.predict_proba(test_x)
oof[valid_index] = val_pred
test_predict += test_pred / kf.n_splits
F1_score = f1_score(val_y, np.where(val_pred>0.5, 1, 0))
cv_scores.append(F1_score)
print(cv_scores)
return oof, test_predict
# 参考demo,具体对照baseline实践部分调用cv_model函数
# 选择lightgbm模型
lgb_oof, lgb_test = cv_model(lgb, x_train, y_train, x_test, 'lgb')
# 选择xgboost模型
xgb_oof, xgb_test = cv_model(xgb, x_train, y_train, x_test, 'xgb')
# 选择catboost模型
cat_oof, cat_test = cv_model(CatBoostClassifier, x_train, y_train, x_test, 'cat')
# 进行取平均融合
final_test = (lgb_test + xgb_test + cat_test) / 3
跑的比较慢。
Stacking
def stack_model(oof_1, oof_2, oof_3, predictions_1, predictions_2, predictions_3, y):
'''
输入的oof_1, oof_2, oof_3可以对应lgb_oof,xgb_oof,cat_oof
predictions_1, predictions_2, predictions_3对应lgb_test,xgb_test,cat_test
'''
train_stack = pd.concat([oof_1, oof_2, oof_3], axis=1)
test_stack = pd.concat([predictions_1, predictions_2, predictions_3], axis=1)
oof = np.zeros((train_stack.shape[0],))
predictions = np.zeros((test_stack.shape[0],))
scores = []
from sklearn.model_selection import RepeatedKFold
folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2021)
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, train_stack)):
print("fold n°{}".format(fold_+1))
trn_data, trn_y = train_stack.loc[trn_idx], y[trn_idx]
val_data, val_y = train_stack.loc[val_idx], y[val_idx]
clf = Ridge(random_state=2024)
clf.fit(trn_data, trn_y)
oof[val_idx] = clf.predict(val_data)
predictions += clf.predict(test_stack) / (5 * 2)
score_single = roc_auc_score(val_y, oof[val_idx])
scores.append(score_single)
print(f'{fold_+1}/{5}', score_single)
print('mean: ',np.mean(scores))
return oof, predictions
它接受之前三个模型传入的参数。
stacking是一种分层模型集成框架。以两层为例,第一层由多个基学习器组成,其输入为原始训练集,第二层的模型则是以第一层基学习器的输出作为特征加入训练集进行再训练,从而得到完整的stacking模型。
第一层和k折交叉验证类似,取平均,第二层stacking,将训练集中的四个标签外加真实标签当作五列新的特征作为新的训练集,选取一个训练模型,根据新的训练集进行训练,然后应用测试集的四个标签组成的测试集进行预测作为最终的result。