一.
下载并成功运行 Anaconda、Jupyter Notebook、Spyder
输入检验(print("hello"))
二.
在 Anaconda Prompt 中安装库:
- 找到 Anaconda 的 Scripts 目录,并复制路径以备后面安装命令使用
D:\Program Files\anaconda3\Scripts
- 进入prompt命令界面输入pip:
①第一个pip命令
检查pip是否成功
②第二个pip命令
③第三、四个pip命令
三.输入代码,依次运行
1.
import io, os, sys, types, time, datetime, math, random
import requests, subprocess,io, tempfile
2.
#导入第三方库
# 数据处理
import numpy as np
import pandas as pd
# 数据可视化
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
from pandas.plotting import scatter_matrix
from mpl_toolkits.mplot3d import Axes3D
# 特征选择和编码
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize
# 机器学习
import sklearn.ensemble as ske
from sklearn import datasets, model_selection, tree, preprocessing, metrics
from sklearn import linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb
# 网格搜索、随机搜索
import scipy.stats as st
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# 模型度量(分类)
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc
# 警告处理
import warnings
warnings.filterwarnings('ignore')
# 在 Jupyter 上画图
%matplotlib inline
3.
# Column names for the data files. They are passed as `names=` to
# pd.read_csv, which reads the files with header=None.
# Names must not carry stray whitespace, otherwise the column cannot be
# addressed by its plain name (the original had ' capital-gain').
headers = [
    'age', 'workclass', 'fnlwgt',
    'education', 'education-num',
    'marital-status', 'occupation',
    'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country',
    'predclass',
]
# Load the training set.
# Fields are separated by a comma followed by one whitespace character, so a
# regex separator is used (this requires engine='python'). "?" entries are
# read as missing values.
training_raw = pd.read_csv(
    'C://Users//Administrator//Desktop//adult.data',
    header=None,
    names=headers,
    sep=r',\s',  # raw string: '\s' is a regex class, not a str escape
    na_values=["?"],
    engine='python',
)

# Load the test set with the same options; skiprows=1 skips the file's
# first line.
test_raw = pd.read_csv(
    'C://Users//Administrator//Desktop//adult.test',
    header=None,
    names=headers,
    sep=r',\s',
    na_values=["?"],
    engine='python',
    skiprows=1,
)
test_raw.shape  # dimensions of the test set; the original note records (16281, 15)

# Stack the training and test rows so they can be analysed together.
# pd.concat is the public API (DataFrame._append is private), and
# ignore_index=True assigns a fresh 0..n-1 index so the duplicate row labels
# coming from the two files cannot cause indexing surprises.
dataset_raw = pd.concat([training_raw, test_raw], ignore_index=True)
# Helper for reporting how much memory a DataFrame occupies.
def convert_size(size_bytes):
    """Format a byte count as a human-readable string such as ``"1.5 KB"``.

    Args:
        size_bytes: Non-negative number of bytes (any int/float-like value,
            e.g. the result of ``DataFrame.memory_usage().sum()``).

    Returns:
        ``"0B"`` for zero. Otherwise the value scaled to the largest
        1024-based unit that keeps it at or above 1 (capped at ``YB``),
        rounded to two decimals, followed by a space and the unit name.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative")

    size_name = ("Bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")

    # Repeated division by 1024 is exact in binary floating point, so unit
    # boundaries (1024, 1024**2, ...) are classified reliably, unlike
    # floor(log(x, 1024)). The index is clamped to the last unit so very
    # large values cannot run past the end of size_name, and values below 1
    # stay in "Bytes" instead of wrapping around to the last entry.
    size = float(size_bytes)
    i = 0
    while size >= 1024 and i < len(size_name) - 1:
        size /= 1024
        i += 1

    return "%s %s" % (round(size, 2), size_name[i])
# memory_usage() reports the bytes used per Series of the DataFrame; the sum
# is the total, which convert_size renders in human-readable form.
convert_size(dataset_raw.memory_usage().sum())
- 运行结果
五、
# Summary statistics using describe()'s default column selection.
dataset_raw.describe()
# Summary statistics restricted to object-dtype columns (include=['O']).
dataset_raw.describe(include=['O'])