\section{Processing of main SGI data} \subsection{Preliminaries} Load HD2013SGI library. <>= library(HD2013SGI) @ Load data and screen annotation. <>= data("datamatrixfull",package="HD2013SGI") D = datamatrixfull$D data("TargetAnnotation",package="HD2013SGI") data("QueryAnnotation",package="HD2013SGI") @ Create output directories. <>= dir.create(file.path("result","data"),recursive=TRUE,showWarnings=FALSE) dir.create(file.path("result","Figures"),recursive=TRUE,showWarnings=FALSE) dir.create(file.path("result","Tables"),recursive=TRUE,showWarnings=FALSE) @ Get indices for different sets of experiments (samples, controls, ...). <>= IndexSAMPLE = which(TargetAnnotation$group == "sample") IndexCTRL = which(TargetAnnotation$group == "SingleKDctrl") IndexNEG = which(TargetAnnotation$group == "negctrl") IndexSAMPLENEG = c(IndexSAMPLE, IndexNEG) @ Set the color value for controls. <>= colCTRL = rep("gray", nrow(TargetAnnotation)) colCTRL[TargetAnnotation$group == "SingleKDctrl"] = "royalblue" colCTRL[TargetAnnotation$group == "negctrl"] = "red" @ \subsection{Transform features and screen normalization} A function for a generalized log transformation. <>= logtrafo <- function(x,c) { log2((x+sqrt(x^2+c^2))/2) } @ Transform all features with a generalized log transformation. A 3 percent quantile is used as an additive constant in the glog transformation. <>= for (i in seq_len(dim(D)[5])) { m = quantile(D[,,,,i,],probs=0.03,na.rm=TRUE) D[,,,,i,] = logtrafo(D[,,,,i,],m) } @ Normalize median per plate and feature to compensate for global differences between replicates and siRNA designs. <>= M = apply(D,c(2:6),median,na.rm=TRUE) M2 = apply(M, c(2,4), mean) M2 = rep(M2, times=8) dim(M2) = dim(M)[c(2,4,1,3,5)] M2 = aperm(M2,c(3,1,4,2,5)) M = M - M2 M = rep(M[], each=dim(D)[1]) dim(M) = dim(D) D = D - M @ Subtract median and devide by median deviation per feature. <>= for (i in seq_len(dim(D)[5])) { D[,,,,i,] = (D[,,,,i,] - median(D[,,,,i,],na.rm=TRUE)) / mad(D[,,,,i,],na.rm=TRUE) } @ \subsection{Quality control of features}\label{QControlFeatures} The dimension of the data cube before quality control is\\ \begin{center} \begin{tabular}{lrrl} & \Sexpr{dim(datamatrixfull$D)[1]} & target genes \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[2]} & siRNA target designs \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[3]} & query genes \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[4]} & siRNA query designs \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[5]} & phenotypic features\\ $\times$ & \Sexpr{dim(datamatrixfull$D)[6]} & biological replicates \end{tabular} \end{center} Take the mean over all four siRNA design pairs and compute for each feature the Pearson correlation between first and second replicate. <>= C = rep(NA_real_,dim(D)[5]) D2 = (D[,1,,1,,] + D[,2,,1,,] + D[,1,,2,,] + D[,2,,2,,]) / 4 for (i in seq_len(dim(D)[5])) { C[i] = cor(as.vector(D2[IndexSAMPLE,,i,1]),as.vector(D2[IndexSAMPLE,,i,2])) } @ Plot the correlation coefficients for all \Sexpr{length(C)} features. <>= pdf(file=file.path("result","Figures","QCreproducibilityOfFeatures.pdf")) plot(-sort(-C),pch=20,xlab="features",ylab="correlation",ylim=c(0,1)) abline(h=0.6) dev.off() @ \begin{center} \includegraphics[width=0.5\textwidth]{result/Figures/QCreproducibilityOfFeatures.pdf} \end{center} As an example the scatter plot of eccentricity measurements is plotted for two replicates. <>= pdf(file=file.path("result","Figures","QCscatterFeaturesBetweenReplicates.pdf")) i = "cell.Bact.m.eccentricity" plot(as.vector(D2[IndexSAMPLE,,i,1]),as.vector(D2[IndexSAMPLE,,i,2]), pch=20,cex=0.6,xlab="eccentricity replicate 1", ylab="eccentricity replicate 2") cc = cor(as.vector(D2[IndexSAMPLE,,i,1]),as.vector(D2[IndexSAMPLE,,i,2])) text(x=2,y=-4,sprintf("cor = %0.2f",cc)) dev.off() @ \begin{center} \includegraphics[width=0.5\textwidth]{result/Figures/QCscatterFeaturesBetweenReplicates.pdf} \end{center} Write a table with correlation coefficients. <>= I = order(-C) write.table(data.frame(name=dimnames(D)[[5]][I],cor=C[I]), file=file.path("result","Tables","QC_reproducibilityOfFeatures.txt"), sep="\t",quote=FALSE,row.names=FALSE) @ Select all features with a correlation of at least 0.6. The number of features passing quality control is \Sexpr{sum(C >= 0.6, na.rm=TRUE)} out of \Sexpr{dim(D)[5]}. <>= I = which(C >= 0.6) D = D[,,,,I,,drop=FALSE] @ The dimension of the data cube after quality control of the phenotypic features is\\ \begin{center} \begin{tabular}{lrrl} & \Sexpr{dim(datamatrixfull$D)[1]} & target genes \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[2]} & siRNA target designs \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[3]} & query genes \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[4]} & siRNA query designs \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[5]} & phenotypic features\\ $\times$ & \Sexpr{dim(datamatrixfull$D)[6]} & biological replicates \end{tabular} \end{center} \subsection{Quality control of siRNA designs}\label{QControlGenes} For each target siRNA the phenotypic profiles are summarized over two query siRNA and two replicates. <>= D1 = (D[,,,1,,1] + D[,,,1,,2] + D[,,,2,,1] + D[,,,2,,2])/4 @ For each target gene the Pearson correlation coefficient is computed for phenotypic profiles between the two siRNA designs separately for each feature. The mean of correlation coefficients over all features is reported for each target gene. <>= Cdesign1 = rep(NA_real_,dim(D)[1]) for (k in seq_len(dim(D)[5])) { for (i in seq_len(dim(D)[1])) { Cdesign1[i] = cor(as.vector(D1[i,1,,k]),as.vector(D1[i,2,,k])) } if ( k == 1) { Cdesign1all = Cdesign1 } else { Cdesign1all = Cdesign1all + Cdesign1 } } Cdesign1all = Cdesign1all / dim(D)[5] @ Plot the correlation coefficients of phenotypic profiles between siRNAs for all target genes. <>= pdf(file.path("result","Figures","QCreproducibilityOfTargetsiRNA.pdf")) plot(-sort(-Cdesign1all[IndexSAMPLE]), pch=20, xlab="target genes",ylab="correlation",ylim=c(0,1)) abline(h=0.7) dev.off() @ \begin{center} \includegraphics[width=0.5\textwidth]{result/Figures/QCreproducibilityOfTargetsiRNA.pdf} \end{center} All gene with a correlation of the phenotypic profiles of at least 0.7 are selected for further analysis. The number of target genes passing quality control is \Sexpr{sum(Cdesign1all[IndexSAMPLE] >= 0.7,na.rm=TRUE)} out of \Sexpr{length(IndexSAMPLE)}. Additionally, \Sexpr{length(IndexNEG)} negative control target siRNAs selected. <>= f = "cell.Bact.m.eccentricity" D1 = D[ TargetAnnotation$Symbol == "CHAF1A",,QueryAnnotation$Symbol == "DPF2",1,f,] pdf(file.path("result","Figures","QCreproducibilityOfTargetsiRNAexample.pdf"),width=2,height=4) par(xpd=NA) bp = barplot(apply(D1,1,mean),ylab="Cell eccentricity", names.arg=c("CHAF1A #1\nDPF2 #1","CHAF1A #2\nDPF2 #1"),las=2) points(bp,D1[,1]) points(bp,D1[,2]) dev.off() @ \begin{center} \includegraphics[width=0.25\textwidth]{result/Figures/QCreproducibilityOfTargetsiRNAexample.pdf} \end{center} <>= I = IndexSAMPLE[Cdesign1all[IndexSAMPLE] >= 0.7] I = c(I, IndexNEG) D = D[I,,,,,,drop=FALSE] TargetAnnotation = TargetAnnotation[I,] @ The dimension of the data cube quality control of the siRNA designs is\\ \begin{center} \begin{tabular}{lrrl} & \Sexpr{dim(datamatrixfull$D)[1]} & target genes \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[2]} & siRNA target designs \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[3]} & query genes \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[4]} & siRNA query designs \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[5]} & phenotypic features\\ $\times$ & \Sexpr{dim(datamatrixfull$D)[6]} & biological replicates \end{tabular} \end{center} \subsection{Selection of non-redundant features}\label{featureSelection} The \Sexpr{dim(D)[5]} features are highly redundant. We select a set of non-redundant features. The phenotypic profiles are summarized over all siRNA designs and control measurements are excluded for feature selection. <>= D1 = (D[,1,,1,,] + D[,1,,2,,] + D[,2,,1,,] + D[,2,,2,,]) / 4 D1 = D1[TargetAnnotation$group == "sample",,,] @ 3000 perturbations are randomly selected to speed up the selection process. <>= dim(D1) = c(prod(dim(D1)[1:2]),dim(D1)[3:4]) D1 = aperm(D1,c(1,3,2)) set.seed(5830458) Sample = sample(seq_len(dim(D1)[1]), 3000) subSampleForStabilitySelection = list(D = D1[Sample,,], Sample = Sample, phenotype = dimnames(D)[[5]]) @ Non-redundant features are selected. The number-of-cells is manually preselected because of its biological interest. All other features are selected automatically in a sequential way. In each step all candidate features are evaluated separately. A linear regression is fitted on the previously selected features. The residuals are a composition of random noise and biological information that is non-redundant to the previously selected features. To estimate if there still exists unexplained biological signal, the Pearson correlation coefficient of the residuals between the two biological replicates is computed. The feature with the largest correlation coefficient is selected. <>= stabilitySelection = HD2013SGIselectByStability(subSampleForStabilitySelection, preselect = c("count"), Rdim = 25, verbose = TRUE) @ The step-wise approach is stopped, when the number of positive correlation coefficients is smaller or equal to the number of negative correlation coefficients. This criterion is motivated by the observation that for random data, half of the correlation coefficients is expected to be positive and the other half is expected to be negative. <>= Sel = stabilitySelection$ratioPositive >= 0.5 @ A set of \Sexpr{sum(Sel)} features is shown to contain non-redundant information. At first we plot the correlation coefficients of the residual features which is considered to be the information content. <>= col = brewer.pal(3,"Pastel1")[1:2] pdf(file.path("result","Figures","FeatureSelectionInformationgain.pdf")) par(mar=c(13,4,1,1)) barplot(stabilitySelection$correlation, names.arg=HD2013SGI:::humanReadableNames[stabilitySelection$selected], col=ifelse(Sel, col[2], col[1]), ylim=c(0,1),las=2,ylab="information gain") dev.off() @ \begin{center} \includegraphics[width=0.5\textwidth]{result/Figures/FeatureSelectionInformationgain.pdf} \end{center} Now we plot the fraction of positive correlation coefficients that serves as a stop criterion. <>= pdf(file.path("result","Figures","FeatureSelectionRatioPositive.pdf")) par(mar=c(13,4,1,1)) barplot(stabilitySelection$ratioPositive-0.5, names.arg=HD2013SGI:::humanReadableNames[stabilitySelection$selected], offset=0.5,col=ifelse(Sel, col[2], col[1]),ylim=c(0,1), las=2,ylab="fraction positive correlated") dev.off() @ \begin{center} \includegraphics[width=0.5\textwidth]{result/Figures/FeatureSelectionRatioPositive.pdf} \end{center} The criteria for feature selection are saved and the features are selected. <>= save(stabilitySelection, file=file.path("result","data", "stabilitySelection.rda")) D = D[,,,,stabilitySelection$selected[Sel],,drop=FALSE] dimnames(D)[[1]] = TargetAnnotation$Symbol dimnames(D)[[3]] = QueryAnnotation$Symbol @ After selecting \Sexpr{sum(Sel)} non-redundant features, the dimension of the data cube is\\ \begin{center} \begin{tabular}{lrrl} & \Sexpr{dim(datamatrixfull$D)[1]} & target genes \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[2]} & siRNA target designs \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[3]} & query genes \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[4]} & siRNA query designs \\ $\times$ & \Sexpr{dim(datamatrixfull$D)[5]} & phenotypic features\\ $\times$ & \Sexpr{dim(datamatrixfull$D)[6]} & biological replicates \end{tabular} \end{center} The datamatrix is now completely pre-processed and can be saved. <>= datamatrix = list(D=D, Anno = list(target = TargetAnnotation, query = QueryAnnotation, phenotype=dimnames(D)[[5]])) save(datamatrix, file=file.path("result","data","datamatrix.rda")) @ \subsection{Pairwise interaction scores} Pairwise interaction scores ($\pi$=scores) are estimated using a robust linear fit. The query main effects are lifted such that they equal to the mean of the single knock down measurements of the query genes (target siRNA is a scrambled sequence serving as negative control). <>= D = datamatrix$D pimatrix = datamatrix pimatrix$D[] = NA_real_ mainEffects = list(target = D[,,1,,,], query = D[1,,,,,], overall = D[1,,1,,,], Anno = datamatrix$Anno) for (i in seq_len(dim(D)[2])) { for (j in seq_len(dim(D)[4])) { for (k in seq_len(dim(D)[5])) { for (l in seq_len(dim(D)[6])) { MP = HD2013SGImaineffects(D[,i,,j,k,l], TargetNeg=which(TargetAnnotation$group == "negctrl")) pimatrix$D[,i,,j,k,l] = MP$pi mainEffects$target[,i,j,k,l] = MP$targetMainEffect mainEffects$query[i,,j,k,l] = MP$queryMainEffect mainEffects$overall[i,j,k,l] = MP$neg } } } } save(mainEffects, file=file.path("result","data","mainEffects.rda")) @ \subsection{Statistical testing of interaction terms} For statistical testing the interaction terms that differ by 4 times the median deviation of the interaction scores are flagged as outliers. Afterwards the interaction terms are tested for significance by a moderated t-test implemented in the R-package limma. <>= D = pimatrix$D PADJ = D[pimatrix$Anno$target$group == "sample",,,,,1] s = rep(NA_real_, dim(D)[5]) for (i in seq_len(dim(D)[5])) { Data = D[,,,,i,] Data = Data[pimatrix$Anno$target$group == "sample",,,,] d = dim(Data) dim(Data) = c(prod(d[1:4]),prod(d[5])) Data[abs(Data[,1]-Data[,2]) > 4*mad(Data[,1]-Data[,2],center=0.0),]=NA_real_ s[i] = median(apply(Data,1,sd), na.rm=TRUE) padj = rep(NA_real_, nrow(Data)) K = which(apply(!is.na(Data),1,all)) fit = eBayes(lmFit(Data[K,])) padj[K] = p.adjust(fit$p.value, method="BH") PADJ[,,,,i] = padj cat("i=",i," nr int (1%) = ",sum(padj <= 0.01,na.rm=TRUE)/nrow(Data), " nr int (3%) = ",sum(padj <= 0.03,na.rm=TRUE)/nrow(Data),"\n") } @ $\pi$-scores for each siRNA pair are summarized over both replicates. Furthermore interaction scores are divided by their median deviation over all siRNA pairs, to provide comparable measurements. <>= PI = pimatrix$D PI = PI[pimatrix$Anno$target$group == "sample",,,,,] Interactions = list(newpiscore = PI, scale = s, padj = PADJ, Anno = pimatrix$Anno) Interactions$Anno$target = Interactions$Anno$target[ pimatrix$Anno$target$group == "sample",] save(Interactions, file=file.path("result","data","Interactions.rda")) @