训练误差or测试误差与特征个数之间的关系--基于R语言实现

a 生成数据集,数据由 $Y = X\beta + \epsilon$ 产生,其中 $p = 20$,$n = 1000$

# Way 1: simulate Y = X %*% B + eps with p = 20 predictors and n = 1000 rows.
set.seed(1)
p <- 20
n <- 1000
x <- matrix(rnorm(n * p), n, p)
# True coefficients; five of them are set to exactly zero so that the true
# model is sparse.
B <- rnorm(p)
B[c(3, 4, 9, 10, 19)] <- 0
# One independent noise draw per observation. Drawing only p values would make
# R silently recycle the same 20 errors across all n rows of x %*% B.
eps <- rnorm(n)
y <- x %*% B + eps  # %*% is matrix multiplication
# Way 2: same simulation recipe, but with a hand-picked true coefficient
# vector instead of random coefficients.
set.seed(1)
a <- rnorm(20 * 1000)
x <- matrix(a, nrow = 1000, ncol = 20)
eps <- rnorm(1000)
# True coefficients; the entries equal to 0 mark predictors that do not
# influence the response.
beta <- c(
  1, 1, 0, 0, 5.5, 2, 5, 0, 4, 0,
  1.5, 11, 10.5, 3.3, 2.8, 0, 9, 0, 2, 6.6
)
y <- x %*% beta + eps  # %*% is matrix multiplication

其中系数向量 $\beta$ 的部分元素为 0(way1 中为第 3、4、9、10、19 个,way2 中为第 3、4、8、10、16、18 个)。

b 划分数据为训练集和测试集

# Way 1: draw 100 of the 1000 row indices without replacement as the training
# set; every remaining row goes to the test set.
train <- sample(seq_len(1000), size = 100, replace = FALSE)
x.train <- x[train, ]
x.test <- x[-train, ]
y.train <- y[train, ]
y.test <- y[-train, ]
# Way 2: keep only index vectors. `train` holds 100 distinct row numbers drawn
# from 1..1000; `test` is their negation, so x[test, ] drops the training rows.
# The argument is spelled out as `replace = FALSE` rather than relying on
# partial matching (`rep`) and the reassignable shorthand `F`.
train <- sample(1:1000, 100, replace = FALSE)
test <- -train

c 训练集MSE分析

# Way 1: best-subset selection on the training rows, followed by the training
# MSE of the selected model of every size 1..p.
library(leaps)
regfit.full <- regsubsets(
  y ~ .,
  data = data.frame(x = x.train, y = y.train),
  nvmax = p
)
val.errors <- rep(NA, p)
# data.frame(x = x.train, ...) names the predictors x.1, ..., x.p; build the
# same names so that coefficients can be matched to matrix columns by name.
x_cols <- colnames(x, do.NULL = FALSE, prefix = "x.")
x_train_named <- x.train
colnames(x_train_named) <- x_cols
for (i in seq_len(p)) {
  coefi <- coef(regfit.full, id = i)
  slopes <- coefi[names(coefi) %in% x_cols]
  # The fitted intercept is part of the model and must be added to the
  # prediction; leaving it out biases every fitted value by a constant.
  # drop = FALSE keeps a one-column matrix for the size-1 model.
  pred <- coefi[["(Intercept)"]] +
    x_train_named[, names(slopes), drop = FALSE] %*% slopes
  val.errors[i] <- mean((y.train - pred)^2)
}
plot(val.errors, ylab = "Training MSE", pch = 19, type = "b")

在这里插入图片描述

# Way 2: best-subset selection restricted to the training rows through the
# `subset` argument; the training MSE is read off the residual sums of squares.
library(leaps)
d <- data.frame(y, x)
fit1 <- regsubsets(y ~ ., data = d, subset = train, nvmax = 20)
s1 <- summary(fit1)
# MSE = RSS / number of training observations (taken from `train` instead of a
# hard-coded 100 so it stays correct if the split size changes).
mse <- s1$rss / length(train)
mse
which.min(mse)
plot(seq_along(mse), mse, type = "b",
     xlab = "number of predictors", ylab = "training MSE")

在这里插入图片描述

> d=data.frame(y,x)
> fit1=regsubsets(y~.,data=d,subset=train,nvmax=20)
> s1=summary(fit1)
> s1
Subset selection object
Call: regsubsets.formula(y ~ ., data = d, subset = train, nvmax = 20)
20 Variables  (and intercept)
    Forced in Forced out
X1      FALSE      FALSE
X2      FALSE      FALSE
X3      FALSE      FALSE
X4      FALSE      FALSE
X5      FALSE      FALSE
X6      FALSE      FALSE
X7      FALSE      FALSE
X8      FALSE      FALSE
X9      FALSE      FALSE
X10     FALSE      FALSE
X11     FALSE      FALSE
X12     FALSE      FALSE
X13     FALSE      FALSE
X14     FALSE      FALSE
X15     FALSE      FALSE
X16     FALSE      FALSE
X17     FALSE      FALSE
X18     FALSE      FALSE
X19     FALSE      FALSE
X20     FALSE      FALSE
1 subsets of each size up to 20
Selection Algorithm: exhaustive
          X1  X2  X3  X4  X5  X6  X7  X8  X9  X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20
1  ( 1 )  " " " " " " " " " " " " " " " " " " " " " " "*" " " " " " " " " " " " " " " " "
2  ( 1 )  " " " " " " " " " " " " " " " " " " " " " " "*" " " " " " " " " "*" " " " " " "
3  ( 1 )  " " " " " " " " " " " " " " " " " " " " " " "*" "*" " " " " " " "*" " " " " " "
4  ( 1 )  " " " " " " " " " " " " " " " " " " " " " " "*" "*" " " " " " " "*" " " " " "*"
5  ( 1 )  " " " " " " " " "*" " " " " " " " " " " " " "*" "*" " " " " " " "*" " " " " "*"
6  ( 1 )  " " " " " " " " "*" " " "*" " " " " " " " " "*" "*" " " " " " " "*" " " " " "*"
7  ( 1 )  " " " " " " " " "*" " " "*" " " "*" " " " " "*" "*" " " " " " " "*" " " " " "*"
8  ( 1 )  " " " " " " " " "*" " " "*" " " "*" " " " " "*" "*" "*" " " " " "*" " " " " "*"
9  ( 1 )  " " " " " " " " "*" " " "*" " " "*" " " " " "*" "*" "*" "*" " " "*" " " " " "*"
10  ( 1 ) " " " " " " " " "*" " " "*" " " "*" " " " " "*" "*" "*" "*" " " "*" " " "*" "*"
11  ( 1 ) " " " " " " " " "*" "*" "*" " " "*" " " " " "*" "*" "*" "*" " " "*" " " "*" "*"
12  ( 1 ) " " " " " " " " "*" "*" "*" " " "*" " " "*" "*" "*" "*" "*" " " "*" " " "*" "*"
13  ( 1 ) " " "*" " " " " "*" "*" "*" " " "*" " " "*" "*" "*" "*" "*" " " "*" " " "*" "*"
14  ( 1 ) "*" "*" " " " " "*" "*" "*" " " "*" " " "*" "*" "*" "*" "*" " " "*" " " "*" "*"
15  ( 1 ) "*" "*" " " " " "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" " " "*" " " "*" "*"
16  ( 1 ) "*" "*" " " " " "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
17  ( 1 ) "*" "*" "*" " " "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
18  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
19  ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
20  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"

在这里插入图片描述

在这里插入图片描述

d 测试集MSE分析

# Way 1: test MSE of the best model of every size 1..p, using the coefficients
# estimated on the training data.
val.errors <- rep(NA, p)
# Name the test-matrix columns like the regsubsets predictors so coefficients
# can be matched by name.
x_test_named <- x.test
colnames(x_test_named) <- x_cols
for (i in seq_len(p)) {
  coefi <- coef(regfit.full, id = i)
  slopes <- coefi[names(coefi) %in% x_cols]
  # Predicted test response. The fitted intercept must be included; leaving it
  # out shifts every prediction by a constant and inflates the MSE.
  pred <- coefi[["(Intercept)"]] +
    x_test_named[, names(slopes), drop = FALSE] %*% slopes
  val.errors[i] <- mean((y.test - pred)^2)  # test MSE for model size i
}
plot(val.errors, ylab = "Test MSE", pch = 19, type = "b")

在这里插入图片描述

# Way 2: test MSE for every model size, computed from the full design matrix
# (which already contains an "(Intercept)" column) restricted to the test rows.
xmat <- model.matrix(y ~ ., data = d)
mse1 <- rep(NA, 20)
for (i in 1:20) {
  beta_hat <- coefficients(fit1, id = i)
  # Pick the columns belonging to this model by coefficient name.
  x_test <- xmat[test, names(beta_hat)]
  pred <- x_test %*% beta_hat
  mse1[i] <- mean((pred - y[test])^2)
}
mse1
plot(1:20, mse1, type = "b", xlab = "model size", ylab = "test MSE")

在这里插入图片描述
在这里插入图片描述

在这里插入图片描述

e 当模型含有多少个特征时,测试集MSE最小。

# Way 1: position of the smallest entry of val.errors; entry i was filled for
# the model with i predictors, so this is the model size with the lowest MSE.
which.min(val.errors)

16 parameter model has the smallest test MSE.

# Way 2: position of the smallest entry of mse1; entry i was filled for the
# model with i predictors, so this is the model size with the lowest test MSE.
which.min(mse1)

在这里插入图片描述

15 parameter model has the smallest test MSE.

f 测试集MSE最小的模型与真实模型比较起来有何不同,比较模型系数。

# Way 1: coefficients of the model size with the smallest test MSE. The size
# is looked up from val.errors instead of being hard-coded, so it follows the
# data if the seed, the split or the simulation changes.
coef(regfit.full, id = which.min(val.errors))

在这里插入图片描述

Caught all but one zeroed out coefficient at x.2,x.4,x.10,x.19.

# Way 2: coefficients of the model size with the smallest test MSE. The size
# is looked up from mse1 instead of being hard-coded, so it follows the data
# if the seed, the split or the simulation changes.
coefficients(fit1, id = which.min(mse1))

在这里插入图片描述
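The 15-variable model drops five of the six truly-zero coefficients (X3, X4, X10, X16, X18); the only one it fails to catch is X8, which is kept even though its true coefficient is 0.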

g 作出 $r$ 在一定范围内取值时 $\sqrt{\sum_{j=1}^p\left(\beta_j-\hat{\beta}_j^r\right)^2}$ 的图像,其中 $\hat{\beta}_j^r$ 为包含 $r$ 个预测变量的最优模型中第 $j$ 个系数的估计值。

# Way 1: for every model size r, the Euclidean distance between the true
# coefficient vector B and the estimate from the best r-variable model, with
# predictors left out of the model counted as estimated at 0:
#   sqrt(sum_j (B_j - Bhat_j^r)^2)
# a[i] is the number of predictors in model i, b[i] the distance.
B_named <- setNames(B, x_cols)
a <- rep(NA, p)
b <- rep(NA, p)
for (i in seq_len(p)) {
  coefi <- coef(regfit.full, id = i)
  slopes <- coefi[names(coefi) %in% x_cols]  # estimates without intercept
  in_model <- x_cols %in% names(slopes)
  a[i] <- length(slopes)
  b[i] <- sqrt(
    # Included predictors: squared difference, matched by name.
    sum((B_named[names(slopes)] - slopes)^2) +
      # Excluded predictors: estimate is 0, so the error is B_j^2. The square
      # must be taken per element before summing, not on the sum.
      sum(B_named[!in_model]^2)
  )
}
plot(x = a, y = b, xlab = "number of coefficients",
     ylab = "error between estimated and true coefficients")
which.min(b)

在这里插入图片描述
在这里插入图片描述

Model with 9 coefficients (10 with intercept) minimizes the error between the
estimated and true coefficients. Test error is minimized with 16 parameter model.
A better fit of true coefficients as measured here doesn’t mean the model will have
a lower test MSE.

# Way 2: for every model size r, the Euclidean distance between the true
# coefficient vector beta and the estimate from the best r-variable model,
# with predictors left out of the model counted as estimated at 0:
#   sqrt(sum_j (beta_j - betahat_j^r)^2)
xcol <- colnames(x, do.NULL = FALSE, prefix = "X")
beta_named <- setNames(beta, xcol)
s <- rep(NA, 20)
for (i in 1:20) {
  est <- coefficients(fit1, id = i)[-1]  # drop the intercept
  in_model <- xcol %in% names(est)
  s[i] <- sqrt(
    # Square each difference before summing; squaring the sum instead lets
    # positive and negative errors cancel and is not a distance.
    sum((beta_named[names(est)] - est)^2) +
      sum(beta_named[!in_model]^2)
  )
}

plot(1:20, s, type = "b", xlab = "number of coefficients",
     ylab = "error between estimated and true coefficients")
which.min(s)

在这里插入图片描述
在这里插入图片描述
Model with 15 coefficients (16 with intercept) minimizes the error between the
estimated and true coefficients. Test error is also minimized with the 15 parameter model.
In general, though, a better fit of true coefficients as measured here doesn’t mean the model will have
a lower test MSE; here the two criteria happen to agree.

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:/a/918958.html

如若内容造成侵权/违法违规/事实不符,请联系我们进行投诉反馈qq邮箱809451989@qq.com,一经查实,立即删除!

相关文章

kafka基础

文章目录 一、Kafka入门1.1、JMS1.2、生产者-消费者模式1.3、ZooKeeper 二、kafka基础架构2.1、producer2.2、kafka cluster2.2.1、broker2.2.2、Controller2.2.3、Topic2.2.4、Partition2.2.5、Replication2.2.6、Leader & Follower 2.3、consumer 一、Kafka入门 Kafka是一…

HarmonyOs鸿蒙开发实战(10)=>状态管理-对象数组的属性数据变更刷新UI,基于@Observed 和@ObjectLink装饰器

1.条件:基于HarmonyOs5.0.0版本. 2.功能要求:横向列表中每个景点的名称(eg: 第二项 “灵隐寺” ), 在通过天气接口拿到对应天气后,拼接到名称后面 > 变成(“灵隐寺” 天气)) 3.老规矩先看…

诡异错误:返回给前端的id被前端自动修改

使用mybatis-plus生成的id,使用雪花算法,是一个long类型的id。 当调用list接口返回给前端后,接口显示数据正常,但是界面上的id不对,多了好几个0,数据都是以0结尾。 由于前端使用vue编写,我不太会…

Django5 2024全栈开发指南(一):框架简介、环境搭建与项目结构

目录 一、Python Web框架要点二、Django流程2.1 Django介绍2.1.1 简介2.1.2 特点2.1.3 MVT模式2.1.4 Django新特性2.1.5 Django学习资料 2.2 搭建Django框架开发环境2.2.1 安装Python语言环境2.2.2 安装Django框架 2.3 创建Django项目2.4 Pycharm创建项目2.5 初试Django52.5.1 …

大模型研究报告 | 2024年中国金融大模型产业发展洞察报告|附34页PDF文件下载

随着生成算法、预训练模型、多模态数据分析等AI技术的聚集融合,AIGC技术的实践效用迎来了行业级大爆发。通用大模型技术的成熟推动了新一轮行业生产力变革,在投入提升与政策扶植的双重作用下,以大模型技术为底座、结合专业化金融能力的金融大…

深入内核讲明白Android Binder【一】

深入内核讲明白Android Binder【一】 前言一、Android Binder应用编写概述二、基于C语言编写Android Binder跨进程通信Demo0. Demo简介1. 服务的管理者server_manager.c2. Binder服务端代码实现 test_service.c2.1 实现思路2.2 完整实现代码 3. Binder客户端代码实现 test_clie…

新一代API开发工具,让API调试更快 更简单

新一代API开发工具 代理调试 请求测试一站式解决方案 Reqable Fiddler Charles Postman, 让API调试更快 🚀 更简单 👌 直接上下载地址 根据系统,下载对应的版本即可 https://reqable.com/zh-CN/download/

LVGL-从入门到熟练使用

LVGL简介 LVGL( Light and Versatile Graphics Library )是一个轻量、多功能的开源图形库。 1、丰富且强大的模块化图形组件:按钮 、图表 、列表、滑动条、图片等 2、高级的图形引擎:动画、抗锯齿、透明度、平滑滚动、图层混合等…

从视频帧生成点云数据、使用PointNet++模型提取特征,并将特征保存下来的完整实现。

文件地址 https://github.com/yanx27/Pointnet_Pointnet2_pytorch?spm5176.28103460.0.0.21a95d27ollfze Pointnet_Pointnet2_pytorch\log\classification\pointnet2_ssg_wo_normals文件夹改名为Pointnet_Pointnet2_pytorch\log\classification\pointnet2_cls_ssg "E:…

时间序列关于可解释性值得关注的论文汇总-第2篇

前言 这是时序可解释性论文汇总的第二篇,第一篇见这里(后台回复:“论文合集”可直接获取整理的文章)。深度学习的可解释性研究一直是热门,而时间序列的可解释性同样非常重要。这是因为时序模型被大量应用到特定领域…

DataStream编程模型之数据源、数据转换、数据输出

Flink之DataStream数据源、数据转换、数据输出(scala) 0.前言–数据源 在进行数据转换之前,需要进行数据读取。 数据读取分为4大部分: (1)内置数据源; 又分为文件数据源; socket…

Java面试题2024-Java基础

Java基础 1、 Java语言有哪些特点 1、简单易学、有丰富的类库 2、面向对象(Java最重要的特性,让程序耦合度更低,内聚性更高) 3、与平台无关性(JVM是Java跨平台使用的根本) 4、可靠安全 5、支持多线程 2、…

数据结构(基本概念及顺序表——c语言实现)

基本概念: 1、引入 程序数据结构算法 数据: 数值数据:能够直接参加运算的数据(数值,字符) 非数值数据:不能够直接参加运算的数据(字符串、图片等) 数据即是信息的载…

使用爬虫获取的数据如何有效分析以优化店铺运营?

在数字化时代,数据已成为电商运营的核心。通过爬虫技术,我们可以从淘宝等电商平台获取大量数据,这些数据如果得到有效分析,将极大助力店铺运营的优化。本文将探讨如何使用爬虫技术获取数据,并利用数据分析来优化店铺运…

c++类对象练习

#include <iostream> #include <cstring>using namespace std;class mystring {char* buf; public:mystring(); //构造函数mystring(const char* str); //构造函数void show(); //输出函数void setmystr(const mystring str); //设置函数const char* getmystr() co…

后端:Spring AOP原理--动态代理

文章目录 1. Spring AOP底层原理2. 代理模式3. 静态代理4. 动态代理4.1 jdk 实现动态代理4.2 cglib 实现动态代理4.3 jdk、cglib动态代理两者的区别 1. Spring AOP底层原理 创建容器 new applicationContext();Spring把所有的Bean进行创建,进行依赖注入…

微信小程序 最新获取用户头像以及用户名

一.在小程序改版为了安全起见 使用用户填写来获取头像以及用户名 二.代码实现 <view class"login_box"><!-- 头像 --><view class"avator_box"><button wx:if"{{ !userInfo.avatarUrl }}" class"avatorbtn" op…

【Linux】进程的状态详解

进程的状态详解 一、各种状态的概念二、运行状态的详细介绍三、阻塞状态详解四、挂起状态和阻塞状态的关系五、观察各种状态在linux中的表示1.运行态R2.睡眠态S3.暂停态T4.深度睡眠状态D5.僵尸状态Z6.孤儿进程 一、各种状态的概念 为了弄明白正在运行的进程是什么意思…

python高级之简单爬虫实现

一、前言 场景1:一个网络爬虫,顺序爬取一个网页花了一个小时,采用并发下载就减少到了20分钟。 场景2:一个应用软件优化前每次打开网页需要3秒,采用异步并发提升到了200毫秒。 假设一个工程的工作量为100,…

web——upload-labs——第十关——.空格.绕过

审计源码 这次先删除文件名左右的空格,然后又删除了我们文件末尾的.,其次将我们上传的文件名转换为小写,删除文件末尾的::$DATA,最后又删除了文件名左右两侧的空格 根据他的逻辑,我们可以构造文件名phpinfo.php. .就是…