You will form five selected groups of size 4-5 persons. You will take 5-7 minutes to sketch written plans of attack on the following questions and then submit your plans for discussion and demonstration.
load("montpick_eset.RData")
montpick.eset
## Error: object 'montpick.eset' not found
Remove pure zeroes.
pz = apply(exprs(montpick.eset),1,
  function(x) all(x==0))
mnz = montpick.eset[-which(pz),]
There is some additional scrubbing needed…
dim(mnz)
## Features  Samples 
##    12160      129
Simple clustering approach.
hc = hclust(dist(t(exprs(mnz))))
table(cutree(hc, k=2))
## 
##   1   2 
## 128   1
table(cutree(hc, k=3))
## 
##  1  2  3 
## 84 44  1
table(cutree(hc, k=4))
## 
##  1  2  3  4 
## 84 33 11  1
Try with simple logarithm after shift by one.
hc.l = hclust(dist(t(log(exprs(mnz)+1))))
table(cutree(hc.l, k=2))
## 
##   1   2 
## 122   7
table(cutree(hc.l, k=3))
## 
##  1  2  3 
## 30 92  7
table(cutree(hc.l, k=4))
## 
##  1  2  3  4 
## 30 92  6  1
Try PAM:
library(cluster)
pam.2 = pam(dist(t(log(exprs(mnz)+1))), k=2)
pam.3 = pam(dist(t(log(exprs(mnz)+1))), k=3)
pam.4 = pam(dist(t(log(exprs(mnz)+1))), k=4)
table(pam.4$clust)
## 
##  1  2  3  4 
## 45 20 49 15
Try K-means:
km.2 = kmeans(t(log(exprs(mnz)+1)), 2)
km.3 = kmeans(t(log(exprs(mnz)+1)), 3)
table(km.2$clust)
## 
##  1  2 
## 77 52
table(km.3$clust)
## 
##  1  2  3 
## 33 60 36
Try kmeans with the voom transformation of the counts.
library(limma)
vmnz = voom(exprs(mnz))
txd = vmnz$E
lkm2v = kmeans(t(txd),2)
lkm4v = kmeans(t(txd),4)
table(lkm2v$clust)
## 
##  1  2 
## 63 66
table(lkm4v$clust)
## 
##  1  2  3  4 
## 15 43 60 11
Consider 'clusterStability' measure.
Try a DESeq analysis.
se = SummarizedExperiment(assays=list(counts=exprs(mnz)))
colData(se) = DataFrame(pData(mnz))
de = DESeqDataSet(se, ~population)
derun = DESeq(de)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 271 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds))
## estimating dispersions
## fitting model and testing
res = results(derun)
summarizeResults(res)
## 
## out of 12160 with nonzero total read count
## 
## at adjusted p-value < 0.1:
## 
## LFC > 0 (up)   : 3143, 26% 
## LFC < 0 (down) : 2975, 24% 
## 
## filtered *     : 1216, 10% 
## flagged **     : 0, 0% 
## 
## * for low mean count, see independentFiltering argument of results()
## ** for high Cook's distance, see cooksCutoff argument of results()
res[which(res$padj < .005),]
## log2 fold change (MAP): population YRI vs CEU 
## Wald test p-value: population YRI vs CEU 
## DataFrame with 3804 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000419    41.785         0.5559   0.09824     5.658 1.528e-08
## ENSG00000000457    48.692         0.4202   0.09047     4.645 3.400e-06
## ENSG00000002330    65.562        -0.6426   0.08949    -7.180 6.966e-13
## ENSG00000002745     1.779         1.7037   0.40252     4.233 2.310e-05
## ENSG00000002834   155.210        -0.6368   0.10974    -5.803 6.516e-09
## ...                   ...            ...       ...       ...       ...
## ENSG00000253873     8.008        -0.6856    0.1749    -3.920 8.849e-05
## ENSG00000253954     9.913        -0.6413    0.1505    -4.262 2.023e-05
## ENSG00000254004     4.520         1.0417    0.1723     6.045 1.492e-09
## ENSG00000254128     9.615         1.0443    0.1772     5.894 3.774e-09
## ENSG00000254206     1.843         2.2559    0.3519     6.410 1.452e-10
##                      padj
##                 <numeric>
## ENSG00000000419 1.151e-07
## ENSG00000000457 1.732e-05
## ENSG00000002330 1.015e-11
## ENSG00000002745 1.007e-04
## ENSG00000002834 5.190e-08
## ...                   ...
## ENSG00000253873 3.470e-04
## ENSG00000253954 8.945e-05
## ENSG00000254004 1.331e-08
## ENSG00000254128 3.120e-08
## ENSG00000254206 1.511e-09
Is this sensitive to technical artifacts? Use SVA on log-transformed data and check.
num = num.sv(log(exprs(mnz)+1), 
  mod=model.matrix(~population,data=pData(mnz)), method="leek")
sv1 = sva(log(exprs(mnz)+1), 
   mod=model.matrix(~population,data=pData(mnz)), n.sv=num)
## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5
print(num)
## [1] 1
summary(sv1$sv)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.1600 -0.0633 -0.0220  0.0000  0.0503  0.3120
secorr = se
secorr$sv = sv1$sv
decorr = DESeqDataSet(secorr, ~sv+population)
derun2 = DESeq(decorr)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
corres = results(derun2)
summarizeResults(corres)
## 
## out of 12160 with nonzero total read count
## 
## at adjusted p-value < 0.1:
## 
## LFC > 0 (up)   : 3161, 26% 
## LFC < 0 (down) : 3006, 25% 
## 
## filtered *     : 1216, 10% 
## flagged **     : 0, 0% 
## 
## * for low mean count, see independentFiltering argument of results()
## ** for high Cook's distance, see cooksCutoff argument of results()
corres[which(corres$padj < .005),]
## log2 fold change (MAP): population YRI vs CEU 
## Wald test p-value: population YRI vs CEU 
## DataFrame with 3818 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000000419    41.785         0.5780   0.09948     5.810 6.247e-09
## ENSG00000000457    48.692         0.3842   0.09019     4.260 2.040e-05
## ENSG00000002330    65.562        -0.6635   0.09199    -7.213 5.487e-13
## ENSG00000002745     1.779         1.7222   0.40362     4.267 1.982e-05
## ENSG00000002834   155.210        -0.7126   0.11082    -6.430 1.277e-10
## ...                   ...            ...       ...       ...       ...
## ENSG00000253873     8.008        -0.6432    0.1780    -3.612 3.034e-04
## ENSG00000253954     9.913        -0.6010    0.1526    -3.938 8.220e-05
## ENSG00000254004     4.520         1.0304    0.1752     5.880 4.095e-09
## ENSG00000254128     9.615         1.0214    0.1800     5.673 1.400e-08
## ENSG00000254206     1.843         2.3030    0.3464     6.647 2.983e-11
##                      padj
##                 <numeric>
## ENSG00000000419 4.947e-08
## ENSG00000000457 8.866e-05
## ENSG00000002330 7.980e-12
## ENSG00000002745 8.639e-05
## ENSG00000002834 1.302e-09
## ...                   ...
## ENSG00000253873 1.045e-03
## ENSG00000253954 3.181e-04
## ENSG00000254004 3.337e-08
## ENSG00000254128 1.048e-07
## ENSG00000254206 3.281e-10
Approaches to classification
library(randomGLM)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:genefilter':
## 
##     area
## 
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## 
## Loading required package: foreach
## foreach: simple, scalable parallel programming from Revolution Analytics
## Use Revolution R for scalability, fault tolerance and more.
## http://www.revolutionanalytics.com
## Loading required package: doParallel
## Loading required package: iterators
mysamp = sample(1:129, size=75)
train = mnz[,sort(mysamp)]
test = mnz[,-sort(mysamp)]
y = 1*(train$pop == "YRI")
## Warning: Name partially matched in data frame
rglm1 = randomGLM( t(exprs(train)), y, xtest=t(exprs(test)) )
table(rglm1$predictedTest, test$pop)
## Warning: Name partially matched in data frame
##    
##     CEU YRI
##   0  22   3
##   1   0  29
An alternate approach to ensemble learning
library(bigrf)
## Loading required package: bigmemory
## Loading required package: bigmemory.sri
## Loading required package: BH
## 
## bigmemory >= 4.0 is a major revision since 3.1.2; please see packages
## biganalytics and and bigtabulate and http://www.bigmemory.org for more information.
isyri = as.integer(1*(train$pop=="YRI")+1)
## Warning: Name partially matched in data frame
bb = bigrfc(data.frame(t(exprs(train))), isyri)
## OOB errors:
##  Tree  Overall error  Error by class
##                           1      2
##    10          12.00   5.26  18.92
##    20          10.67   7.89  13.51
##    30           6.67   5.26   8.11
##    40           6.67   2.63  10.81
##    50           5.33   2.63   8.11
bigpr = predict(bb, x=data.frame(t(exprs(test))))
## Processing tree number:
##    10
##    20
##    30
##    40
##    50
table(bigpr, test$pop)
## Warning: Name partially matched in data frame
##      
## bigpr CEU YRI
##     1  22   3
##     2   0  29