1.首先生成随机数据
import random

import numpy as np
import pandas as pd
# Build a synthetic feature matrix: 153629 samples (rows) x 13 features
# (columns), each entry drawn from a standard normal distribution.
N_SAMPLES = 153629
N_FEATURES = 13
random_data = np.random.randn(N_SAMPLES, N_FEATURES)
# Class labels; the box plots drawn later are grouped by these values.
label = [0, 1, 2, 3]
# Draw one label per row of random_data (uniformly, with replacement). The
# count is read from random_data itself so the two arrays cannot drift apart.
labels = np.asarray(random.choices(label, k=random_data.shape[0]))
# Add a trailing axis, (n,) -> (n, 1), so the labels can be appended as a column.
label_reshape = labels[:, np.newaxis]
将数据与标签在第二维度上进行拼接
# Column names: the 13 feature columns followed by the class label column.
data_list = [
    'red', 'gre', 'blu', 'x', 'y', 'mean', 'Variance', 'Homogeneity',
    'Contrast', 'Dissimilarity', 'Entropy', 'Second', 'Correlation', 'label',
]
# Append the (n, 1) label column to the right of the (n, 13) feature matrix.
data = np.concatenate([random_data, label_reshape], axis=1)
# Wrap the combined array in a DataFrame so columns can be addressed by name.
img_feature = pd.DataFrame(data=data, columns=data_list)
# Preview the first five rows (only visible in an interactive/notebook session).
img_feature.head(5)
np.hstack 是 NumPy(Numerical Python 的简称)中的一个函数,用于水平堆叠数组(即沿第二个轴,也就是列方向堆叠)。如果传入的数组不是一维的,那么 np.hstack 会将它们沿着第二个轴(索引为1的轴)堆叠起来。
2. 数据对齐
这一步本来是为了方便将数据按照不同的数据类型与标签进行分类(例如:分别提取出四类标签各自对应的mean数据,并将其合并为一个新的DataFrame)。为保证各类标签的数据长度一致,先求出四类标签中样本数量的最小值min_length:
# Smallest class size among the four labels; every class is later sampled
# down to this many rows so the per-label columns have equal length.
min_length = min(
    img_feature[img_feature['label'] == label_value].shape[0]
    for label_value in (0, 1, 2, 3)
)
print(min_length)
根据min_length,利用位置索引从不同标签的数据中各随机采样min_length个样本数据
利用bool索引对每类标签的数据进行筛选
# Number of rows belonging to each label class (labels 0..3, in that order).
label_0_len, label_1_len, label_2_len, label_3_len = (
    len(img_feature[img_feature['label'] == label_value])
    for label_value in (0, 1, 2, 3)
)

# NOTE(review): create_bool is defined outside this section. It is presumably
# expected to return a boolean mask of length label_k_len with min_length
# truthy entries chosen at random -- confirm against its definition.

# Label 0
print("0标签:")
label0_bool_index = create_bool(label_0_len, min_length)
print("标签0数量", len(label0_bool_index))
print("标签0有效索引数量", np.where(label0_bool_index)[0].size)

# Label 1
print("1标签:")
label1_bool_index = create_bool(label_1_len, min_length)
print("标签1数量", len(label1_bool_index))
print("标签1有效果索引数量", np.asarray(label1_bool_index).sum())

# Label 2
print("2标签:")
label2_bool_index = create_bool(label_2_len, min_length)
print("标签2数量", len(label2_bool_index))
print("标签2有效果索引数量", sum(label2_bool_index))

# Label 3
print("3标签:")
label3_bool_index = create_bool(label_3_len, min_length)
print("标签3数量", len(label3_bool_index))
print("标签3有效果索引数量", sum(label3_bool_index))

# Collect the four masks under the keys the plotting helpers look up.
label_index = dict(
    label0=label0_bool_index,
    label1=label1_bool_index,
    label2=label2_bool_index,
    label3=label3_bool_index,
)
根据输入数据、数据类型、标签索引,构建一个不同标签对某一数据类型对比数据框
#数据统计
def diff_label_static(img_feature, feature_flag, label_bool_index):
    """Build a side-by-side table of one feature for the four label classes.

    For each label value 0-3, the rows of ``img_feature`` carrying that label
    are filtered out, and the rows whose position is truthy in the matching
    mask ``label_bool_index["label<k>"]`` are kept. The selected values of
    ``feature_flag`` become column ``"label<k>"`` of the result.

    Args:
        img_feature: DataFrame with a ``'label'`` column and a column named
            ``feature_flag``.
        feature_flag: Name of the feature column to extract.
        label_bool_index: Mapping with keys ``"label0"`` .. ``"label3"``; each
            value is a boolean mask over that label's rows, in row order.

    Returns:
        DataFrame with columns ``label0`` .. ``label3`` and one row per
        selected sample.

    Raises:
        ValueError: From ``np.stack`` when the masks select a different number
            of rows for different labels.
    """
    label_list = ['label0', 'label1', 'label2', 'label3']
    columns = []
    for label_value, label_name in enumerate(label_list):
        # All rows carrying this label.
        label_data = img_feature[img_feature['label'] == label_value]
        # Convert the boolean mask into positional indices usable with .iloc.
        positions = np.where(label_bool_index[label_name])[0]
        columns.append(np.asarray(label_data.iloc[positions][feature_flag]))
    # Stack the four equal-length vectors as columns -> shape (n_selected, 4).
    temp = np.stack(columns, axis=1)
    return pd.DataFrame(temp, columns=label_list)
绘制不同标签数据的box箱型图,需要先将数据按第一维度进行拼接(这一步应该可以优化一下)
def diff_label_static_plot(img_feature, feature_flag, label_bool_index):
    """Build a long-format (value, label) table of one feature for plotting.

    For each label value 0-3, the rows of ``img_feature`` carrying that label
    are filtered out, and the rows whose position is truthy in the matching
    mask ``label_bool_index["label<k>"]`` are kept. The selected values of
    ``feature_flag`` for all four labels are concatenated end to end, next to
    a parallel column holding the label each value came from.

    Args:
        img_feature: DataFrame with a ``'label'`` column and a column named
            ``feature_flag``.
        feature_flag: Name of the feature column to extract.
        label_bool_index: Mapping with keys ``"label0"`` .. ``"label3"``; each
            value is a boolean mask over that label's rows, in row order.

    Returns:
        DataFrame with columns ``data`` (the feature values) and ``label``
        (0.0, 1.0, 2.0 or 3.0), one row per selected sample.
    """
    label_keys = ('label0', 'label1', 'label2', 'label3')
    value_parts = []
    label_parts = []
    for label_value, label_key in enumerate(label_keys):
        # All rows carrying this label.
        label_data = img_feature[img_feature['label'] == label_value]
        # Convert the boolean mask into positional indices usable with .iloc.
        positions = np.where(label_bool_index[label_key])[0]
        selected = label_data.iloc[positions]
        value_parts.append(np.asarray(selected[feature_flag]))
        # Size the label run from this class's own selection so values and
        # labels stay aligned even when the classes select different counts.
        label_parts.append(np.full(len(selected), float(label_value)))
    temp_data = np.hstack(value_parts)
    temp_label = np.hstack(label_parts)
    # Pair every value with its label -> shape (n_total, 2).
    data = np.stack([temp_data, temp_label], axis=1)
    return pd.DataFrame(data, columns=['data', 'label'])
3.绘制箱型图
# 不同标签的特征统计量函数绘制图片
import matplotlib.pyplot as plt
# Feature columns to plot, one subplot each (12 entries fill a 3x4 grid).
data_flag = [
    'red', 'gre', 'blu', 'mean',
    'Variance', 'Homogeneity', 'Contrast', 'Dissimilarity',
    'Entropy', 'Second', 'Correlation', 'label',
]
# Replacement x-axis tick labels, one per label class.
labels = [
    'Label1',
    'Label2',
    'Label3',
    'Label4',
]
def draw_subboxplot(img_feature, data_flag, label_bool_index, nrows_ncols=(3, 4)):
    """Draw a grid of box plots, one subplot per feature, grouped by label.

    Subplots are filled row by row with the features listed in ``data_flag``.
    Each subplot's x tick labels come from the module-level ``labels`` list,
    and its y range is set from the minimum and maximum of the sampled data.

    Args:
        img_feature: DataFrame holding the feature columns and a ``'label'``
            column.
        data_flag: Sequence of feature column names to plot.
        label_bool_index: Mapping ``"label0"`` .. ``"label3"`` -> boolean mask,
            passed through to ``diff_label_static_plot`` and
            ``diff_label_static``.
        nrows_ncols: ``(rows, columns)`` of the subplot grid. Any indexable
            pair is accepted, including a list.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    n_rows, n_cols = nrows_ncols[0], nrows_ncols[1]
    # sharey=False lets every subplot keep its own y range; squeeze=False keeps
    # axs two-dimensional even when the grid has a single row or column.
    _, axs = plt.subplots(
        nrows=n_rows, ncols=n_cols, figsize=(12, 12), sharey=False, squeeze=False
    )
    # axs.flat walks the grid row by row, matching the order of data_flag.
    for idx, ax in enumerate(axs.flat):
        if idx >= len(data_flag):
            # More grid cells than features: leave the spare cells blank.
            ax.set_visible(False)
            continue
        feature = data_flag[idx]
        # Long-format (value, label) table used for the grouped box plot.
        df = diff_label_static_plot(img_feature, feature, label_bool_index)
        # Wide-format table (one column per label) used for the y range.
        df_static = diff_label_static(img_feature, feature, label_bool_index)
        min_static = df_static.min().min()
        max_static = df_static.max().max()
        df.plot.box(ax=ax, by="label", xlabel='class', rot=45, return_type='axes')
        print("数据类型索引序号:", idx)
        print("数据类型", feature)
        ax.set_xticklabels(labels)
        ax.set_title(feature)
        # Features can live on very different scales (e.g. 0~255 versus 0~1),
        # so tick rounding is chosen per subplot from the data range.
        if max_static > 10:
            tick_decimals, text_decimals = 0, 0
        else:
            min_static, max_static = round(min_static, 2), round(max_static, 2)
            # Ticks are placed at 2 decimals but displayed with 1.
            tick_decimals, text_decimals = 2, 1
        ax.set_ylim(min_static, max_static)
        ticks = np.linspace(min_static, max_static, num=4)
        ax.set_yticks(np.round(ticks, tick_decimals))
        ax.set_yticklabels(np.round(ticks, text_decimals))
    # Adjust spacing once, after all subplots exist, so labels do not overlap.
    plt.tight_layout()
    plt.show()
# Render the 3x4 grid: one box plot per entry of data_flag.
draw_subboxplot(
    img_feature,
    data_flag,
    label_index,
    nrows_ncols=[3, 4],
)
需要注意的是 plt.subplots(nrows=nrows_ncols[0], ncols=nrows_ncols[1], figsize=(12, 12), sharey=False)中的sharey参数用于控制各子图是否共享Y轴坐标系;如果需要为每个子图设置不同的坐标轴刻度与坐标范围,需要将其设置为False(即不共享)。
可视化显示
参考文档:
pandas
matplotlib