edgeR
edgeR是非常经典的转录组表达差异分析软件。
样本量:72个转录组样本
library(edgeR)
library(HTSFilter)
fc <- read.table('counts.txt',header=1,row.names="Geneid") #counts矩阵,有表头
group <- as.factor(strsplit2(colnames(fc),"_")[,1]) #分组信息,与Coutns文件列名相应
y <- DGEList(counts=fc,group=group)
keep <- rowSums(cpm(y)>1)>=2 #过滤,至少在2个样本中的cpm值大于1,由于样本量较多,这里取了比较宽松的阈值。
y <- y[keep,,keep.lib.sizes=FALSE]
y <- calcNormFactors(y)
design <- model.matrix(~0+group)
colnames(design) <- levels(group)
# 设计比较矩阵,格式“name1 = Case - Control”,Case/Control 的名字与group 因子相同
mycontrasts <- makeContrasts(C1vsC0=C1-C0,C2vsC1=C2-C1,C3vsC2=C3-C2,C4vsC3=C4-C3,C5vsC4=C5-C4,G1vsG0=G1-G0,G2vsG1=G2-G1,G3vsG2=G3-G2,G4vsG3=G4-G3,G5vsG4=G5-G4,STS1vsSTS0=STS1-STS0,STS2vsSTS1=STS2-STS1,STS3vsSTS2=STS3-STS2,STS4vsSTS3=STS4-STS3,STS5vsSTS4=STS5-STS4,YZ1vsYZ0=YZ1-YZ0,YZ2vsYZ1=YZ2-YZ1,YZ3vsYZ2=YZ3-YZ2,YZ4vsYZ3=YZ4-YZ3,YZ5vsYZ4=YZ5-YZ4,C0vsG0=C0-G0,C1vsG1=C1-G1,C2vsG2=C2-G2,C3vsG3=C3-G3,C4vsG4=C4-G4,C5vsG5=C5-G5,YZ0vsG0=YZ0-G0,YZ1vsG1=YZ1-G1,YZ2vsG2=YZ2-G2,YZ3vsG3=YZ3-G3,YZ4vsG4=YZ4-G4,YZ5vsG5=YZ5-G5,C0vsSTS0=C0-STS0,C1vsSTS1=C1-STS1,C2vsSTS2=C2-STS2,C3vsSTS3=C3-STS3,C4vsSTS4=C4-STS4,C5vsSTS5=C5-STS5,YZ0vsSTS0=YZ0-STS0,YZ1vsSTS1=YZ1-STS1,YZ2vsSTS2=YZ2-STS2,YZ3vsSTS3=YZ3-STS3,YZ4vsSTS4=YZ4-STS4,YZ5vsSTS5=YZ5-STS5,C0vsYZ0=C0-YZ0,C1vsYZ1=C1-YZ1,C2vsYZ2=C2-YZ2,C3vsYZ3=C3-YZ3,C4vsYZ4=C4-YZ4,C5vsYZ5=C5-YZ5,G0vsSTS0=G0-STS0,G1vsSTS1=G1-STS1,G2vsSTS2=G2-STS2,G3vsSTS3=G3-STS3,G4vsSTS4=G4-STS4,G5vsSTS5=G5-STS5,levels=design)
y <- estimateDisp(y,design)
fit <- glmQLFit(y,design)
#一个函数用于批量导出比对结果
function_edgeR <- function(i) {
acontrast <- mycontrasts[,i]
qlf <- glmQLFTest(fit,contrast=acontrast)
write.table(qlf$table, file=paste(colnames(mycontrasts)[i],'txt',sep='.'),row.names=TRUE,col.names=TRUE,quote=F)
}
mapply(function_edgeR,i=seq(1,length(colnames(mycontrasts))))
#输出CPM
write.table(cpm(y), file="cpm_all.txt",row.names=TRUE,col.names=TRUE,quote=F)
#输出FPKM
gene <- read.table('exon_all_length.txt', header=1) #输入转录本长度(总exon长度)
rownames(gene) <- gene$gene
y$genes <- data.frame(Length=gene[rownames(y),]$length) #添加到 DEG List; "genes","Length" 名称固定
write.table(rpkm(y), file="fpkm_all.txt",row.names=TRUE,col.names=TRUE,quote=F)
附:计算外显子长度
library(GenomicFeatures)
gff3 <- makeTxDbFromGFF('annotation.gff3',format='gff3') #注释文件中一定要有外显子信息
exons <- exonsBy(gff3,by='gene') #外显子位置
exons_lens <- lapply(exons,function(x){sum(width(reduce(x)))}) #去除overlap
exons_lens_t <- t(as.data.frame(exons_lens)) # l转化为数据框
write.table(exons_lens_t, file="exon_all_length.txt",row.names=TRUE,col.names=TRUE,quote=F)
附:edgeR结果整理
library(fdrtool)
lf <- list.files(pattern="vs") # 比对结果文件
# 计算qvalue并输出
fdr_function <- function(name){
contr <- substr(name,1,6) #文件前缀
f <- read.table(name,header=1)
fdr <- fdrtool(f$PValue,statistic='pvalue',plot = F)
f$qvalue <- fdr$qval
write.table(f,file=paste(contr,"qvalue.txt",sep = "."),quote=F,row.names=T,col.names=T,sep='\t')
}
mapply(fdr_function, name=lf,SIMPLIFY = F)
#bash
#提取差异基因 |logFC| >=1 && p <0.05
cat ../contrast.txt |while read id;do awk -v contr=$id 'BEGIN{print "gene\tlogFC\tPValue\tqvalue\tcontrast"}($2 >=1 || $2 <= -1) && $5 < 0.05 {print $1"\t"$2"\t"$5"\t"$6"\t"contr}' $id.qvalue.txt >DEG_logfc1_p0.05/$id.DEG.txt;done
cat ../contrast.txt |while read id;do tail -n +2 $id.DEG.txt|awk -v n=$id '{print $1"\t"$2"\t"$5"\t"n}' >>DEG_all.txt;done
#添加表达注释信息,up/down/false
cat ../contrast.txt |while read id;do tail -n +2 $id.qvalue.txt|awk 'BEGIN {print "ID\tlog2FoldChange\tpvalue\tpadj\tregulated"} {if ($2 >=1 && $5 < 0.05) {print $1"\t"$2"\t"$5"\t"$6"\tup"} else if ($2 <= -1 && $5 < 0.05) {print $1"\t"$2"\t"$5"\t"$6"\tdown"} else {print $1"\t"$2"\t"$5"\t"$6"\tfalse"}}' >DEG_logfc1_p0.05_regulated/$id.regulated.txt;done