文件在单细胞\5_GC_py\1_single_cell\1.GSE183904.Rmd
GSE183904
数据原文
1.获取临床信息
筛选样本可以参考临床信息
rm(list = ls())
library(tinyarray)
a = geo_download("GSE183904")$pd
head(a)
table(a$Characteristics_ch1) #统计各样本有多少
2.批量读取
学会如何读取特定的样本
if(!file.exists("f.Rdata")){
#untar("GSE183904_RAW.tar",exdir = "GSE183904_RAW")
fs = dir("GSE183904_RAW/")[c(2,7)] #dir("GSE183904_RAW/"),列出所有文件
#为了省点内存只做2个样本,去掉[c(2,7)]即做全部样本
f = lapply(paste0("GSE183904_RAW/",fs),read.csv,row.names = 1)
#row.names = 1写在lapply的括号里,但是它是read.csv的参数
fs = stringr::str_split_i(fs,"_",1)
names(f) = fs
save(f,file = "f.Rdata")
}
load("f.Rdata")
library(Seurat)
scelist = list()
for(i in 1:length(f)){
scelist[[i]] <- CreateSeuratObject(counts = f[[i]],
project = names(f)[[i]])
print(dim(scelist[[i]]))
}
sce.all = merge(scelist[[1]],scelist[-1])
sce.all = JoinLayers(sce.all) #连接数据
head(sce.all@meta.data)
table(sce.all$orig.ident)
3.质控指标
sce.all[["percent.mt"]] <- PercentageFeatureSet(sce.all, pattern = "^MT-")
sce.all[["percent.rp"]] <- PercentageFeatureSet(sce.all, pattern = "^RP[SL]")
sce.all[["percent.hb"]] <- PercentageFeatureSet(sce.all, pattern = "^HB[^(P)]")
head(sce.all@meta.data, 3)
VlnPlot(sce.all,
features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt",
"percent.rp",
"percent.hb"),
ncol = 3,pt.size = 0, group.by = "orig.ident")
4.整合降维聚类分群
f = "obj.Rdata"
library(harmony)
if(!file.exists(f)){
sce.all = sce.all %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData(features = rownames(.)) %>%
RunPCA(pc.genes = VariableFeatures(.)) %>%
RunHarmony("orig.ident") %>% #RunHarmony 包,整合多个样本,处理多样本的必备步骤
FindNeighbors(dims = 1:15, reduction = "harmony") %>%
FindClusters(resolution = 0.5) %>%
RunUMAP(dims = 1:15,reduction = "harmony") %>%
#reduction = "harmony"必须写上
RunTSNE(dims = 1:15,reduction = "harmony")
save(sce.all,file = f)
}
load(f)
ElbowPlot(sce.all)
UMAPPlot(sce.all,label = T)
TSNEPlot(sce.all,label = T)
5.手动注释
markers = read.delim("GCmarker.txt",header = F,sep = ";")
library(tidyr)
markers = separate_rows(markers,V2,sep = ",") #拆分marker
markers = split(markers$V2,markers$V1)
DotPlot(sce.all,features = markers,cols = "RdYlBu")+
RotatedAxis()
ggplot2::ggsave("dotplot.png",height = 10,width = 25)
writeLines(paste0(as.character(0:13),","))
names(markers)
celltype = read.csv("celltype.csv",header = F) #自己照着DotPlot图填的
celltype
new.cluster.ids <- celltype$V2
names(new.cluster.ids) <- levels(sce.all)
seu.obj <- RenameIdents(sce.all, new.cluster.ids)
save(seu.obj,file = "seu.obj.Rdata")
p1 <- DimPlot(seu.obj,
reduction = "tsne",
label = TRUE,
pt.size = 0.5) + NoLegend()
p1
6.自动注释
SingleR完成自主注释,不同的是scRNA = sce.all
library(celldex)
library(SingleR)
ls("package:celldex")
f = "ref_BlueprintEncode.RData"
if(!file.exists(f)){
ref <- celldex::BlueprintEncodeData()
save(ref,file = f)
}
ref <- get(load(f))
library(BiocParallel)
scRNA = sce.all
test = scRNA@assays$RNA@layers$data
rownames(test) = Features(scRNA)
colnames(test) = Cells(scRNA)
pred.scRNA <- SingleR(test = test,
ref = ref,
labels = ref$label.main,
clusters = scRNA@active.ident)
pred.scRNA$pruned.labels
#查看注释准确性
plotScoreHeatmap(pred.scRNA, clusters=pred.scRNA@rownames, fontsize.row = 9,show_colnames = T)
new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(scRNA)
levels(scRNA)
scRNA <- RenameIdents(scRNA,new.cluster.ids)
levels(scRNA)
p2 <- DimPlot(scRNA, reduction = "tsne",label = T,pt.size = 0.5) + NoLegend()
p1+p2
7.marker基因
找不同细胞类型间的差异基因
f = "markers.Rdata"
if(!file.exists(f)){
allmarkers <- FindAllMarkers(seu.obj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
save(allmarkers,file = f)
}
load(f)
head(allmarkers)
如果想自行修改orig.ident:使用下边的代码:
sce.all@meta.data$orig.ident=rep(c("a","b"),times= c(ncol(scelist[[1]]),
ncol(scelistl[[2]])))