-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathComplete_Code.R
More file actions
132 lines (110 loc) · 4 KB
/
Complete_Code.R
File metadata and controls
132 lines (110 loc) · 4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Library/ R Packages
library(dplyr)
library(Biobase)
library(GenomicRanges)
library(SGSeq)
library(DESeq2)
####################################################################
# Import data sets
annot <- read.csv("annotation.csv",header = T, sep=",")
names(annot)<-c("Patient","Group")
gene.exp <- read.csv("data.csv",header = T,sep=",",row.names=1)
names(gene.exp)
dim(gene.exp)
head(annot)
####################################################################
# VISUALIZATION #
####################################################################
png("p1.png")
plot(sapply(gene.exp, sum)/1000000,col=(annot$Group+2),
xlab="subject",ylab="Sum of count (mil)",
main="Sum of all counts by subject")
legend(34,8.8,c("Placebo","Novel"),col=c(3,2),pch=c(20,20))
abline(h=mean(sapply(gene.exp, sum)/1000000),lwd=2,col=4)
dev.off()
groups <- c(rep("Placebo", 17), rep("Novel", 24))
se <- DESeqDataSetFromMatrix(countData = gene.exp,
colData = as.data.frame(groups) ,
design = ~ groups)
dds <- DESeqDataSetFromMatrix(countData = gene.exp,
colData = as.data.frame(groups) ,
design = ~ groups)
#Estimate size factors
dds <- estimateSizeFactors( dds )
sizeFactors(dds)
colSums(counts(dds))
#Plot column sums according to size factor
png("p2.png")
plot(sizeFactors(dds), colSums(counts(dds))/1000000, col=4,
xlab="scaling factor",ylab="Sum of reads count")
abline(lm(colSums(counts(dds))/1000000 ~ sizeFactors(dds) + 0))
title("Scaling factor for count data in each subject")
dev.off()
logcounts <- log2(counts(dds, normalized=TRUE)+1)
norm_count = counts(dds, normalized=TRUE)
# visualize after normalization
png("p3.png")
plot(colMeans(norm_count),col=annot$Group+2, xlab="subject",ylab="Mean of count",
main = "Mean of reads count after normalisation")
dev.off()
png("p4.png")
plot(rowMeans(norm_count),col=annot$Group+2, xlab="gene",ylab="Mean of count",
main = "Mean of reads count after normalisation")
legend(45000,95000,c("Placebo","Novel"),cex=0.8,col=c(3,2),pch=c(20,20))
dev.off()
library(factoextra)
pc1 = prcomp((norm_count))
#plot(pc1$rotation[,1],pc1$rotation[,2],col=annot$Group+3,pch=18,cex=1.2,
# xlab="PC1",ylab="PC2",main="Biplot for PCA")
png("p5.png")
fviz_eig(pc1)
dev.off()
png("p6.png")
groups <- as.factor(annot$Group)
fviz_pca_ind(pc1,
col.ind = groups, # color by groups
palette = c("#00AFBB", "#FC4E07"),
#addEllipses = TRUE, # Concentration ellipses
ellipse.type = "confidence",
legend.title = "Groups",
repel = TRUE
)
dev.off()
# means versus variance of RNA-seq expression
png("var.png")
plot(rowMeans(norm_count),rowVars(norm_count),col=3,pch=18)
dev.off()
V####################################################################
# TESING DE GENES #
####################################################################
# VOOM transformation to apply linear regression and limma
library(edgeR)
png("voom1.png")
voom1 <- voom(norm_count, mod, plot = T)
dev.off()
d0 <- DGEList(norm_count)
# Filter low-expressed genes
cutoff <- 1
drop <- which(apply(cpm(d0), 1, max) < cutoff)
d <- d0[-drop,]
dim(d) # number of genes left
snames <- colnames(norm_count) # Sample names
mm <- model.matrix(~ groups)
png("voom.png")
par(mfrow=c(1,2))
tmp <- voom(d0, mm, plot = T)#, main="before fitering low expressed gene")
y <- voom(d, mm, plot = T)
dev.off()
# fitting linear model in limma
fit <- lmFit(y, mm)
head(coef(fit))
fit.e<-eBayes(fit)
top.table <- topTable(fit.e, sort.by = "p", n = Inf)
xtable(head(top.table, 10))
length(which(top.table$adj.P.Val < 0.05))
gene_names = select(org.Hs.eg.db, rownames(top.table)[1:7],
c("SYMBOL","GENENAME","ENTREZID"),"ENSEMBL")
gene <- mapIds(org.Hs.eg.db,keys=row.names(top.table)[1:7],
column="GENENAME", keytype="ENSEMBL",
multiVals="first")
#