## ---- message=FALSE------------------------------------------------------ library(QuaternaryProd) # Compute the probability mass function pmf <- QP_Pmf(q_p = 20, q_m = 20, q_z = 20, q_r = 0, n_p = 20, n_m = 20, n_z = 20) # Plot the mass function plot(names(pmf), pmf, col="blue", xlab = "scores", ylab = "probabilities") lines(names(pmf), pmf, col = "blue") ## ---- message=FALSE------------------------------------------------------ # Get the p-value of score 5 pval <- QP_Pvalue(score = 5, q_p = 20, q_m = 20, q_z = 20, q_r = 0, n_p = 20, n_m = 20, n_z = 20) pval # Compue the p-value only if it is statistically significant otherwise # return -1 pval <- QP_SigPvalue(score = 5, q_p = 20, q_m = 20, q_z = 20, q_r = 0, n_p = 20, n_m = 20, n_z = 20) pval ## ---- message=FALSE------------------------------------------------------ library(QuaternaryProd) library(readr) library(org.Hs.eg.db) library(dplyr) library(stringr) library(fdrtool) # Get the full file name containing the STRINGdb relations ff <- system.file("extdata", "9606.protein.actions.v10.txt.gz", package="QuaternaryProd") all_rels <- read_tsv(gzfile(ff), col_names = TRUE) ## ---- message=FALSE------------------------------------------------------ # Set new names for columns names(all_rels) <- c("srcuid", "trguid", "mode", "action", "direction","score") Rels <- all_rels[, c("srcuid", "trguid", "mode", "direction")] # Get all rows with causal relations Rels <- Rels[Rels$mode %in% c("activation", "inhibition","expression"),] # Get causal relations where direction is not specified, and consider reversed # direction of causality as a valid causal relation Bidirectional <- Rels[Rels$direction == 0 , c("trguid", "srcuid", "mode", "direction")] names(Bidirectional) <- c("srcuid", "trguid", "mode", "direction") Rels <- unique(bind_rows(Rels, Bidirectional)) Rels$direction <- NULL # Rename activation as increases, inhibition as decreases, expression # as regulates Rels$mode <- sub("activation", "increases", Rels$mode) Rels$mode <- sub("inhibition", "decreases", Rels$mode) Rels$mode <- sub("expression", "regulates", Rels$mode) Rels <- unique(Rels) # Get a subset of the network: Skip this step if you want the p-values # of the scores corresponding to the source nodes computed over the # entire network. Rels <- Rels[sample(1:nrow(Rels), 40000, replace=FALSE),] ## ---- message=FALSE------------------------------------------------------ # Get all unique protein ensemble ids in the causal network allEns <- unique(c(Rels$srcuid, Rels$trguid)) # Map ensemble protein ids to entrez gene ids map <- org.Hs.egENSEMBLPROT2EG id <- unlist(mget(sub("9606.","",allEns), map, ifnotfound=NA)) id[is.na(id)] <- "-1" uid <- paste("9606.", names(id), sep="") # Function to map entrez ids to gene symbols map <- org.Hs.egSYMBOL symbol <- unlist(mget(id, map, ifnotfound=NA)) symbol[is.na(symbol)] <- "-1" # Create data frame of STRINGdb protein Id, entrez id and gene symbol and type of entity Ents <- data_frame(uid, id, symbol, type="protein") Ents <- Ents[Ents$uid %in% allEns,] # Remove ensemble ids in entities with duplicated entrez id Ents <- Ents[!duplicated(Ents$id),] # Add mRNAs to entities uid <- paste("mRNA_", Ents$uid, sep = "") mRNAs <- data_frame(uid=uid, id=Ents$id, symbol=Ents$symbol, type="mRNA") Ents <- bind_rows(Ents, mRNAs) ## ---- message=FALSE------------------------------------------------------ # Get all unique relations Rels$trguid <- paste("mRNA_", Rels$trguid, sep="") Rels <- Rels[Rels$srcuid %in% Ents$uid & Rels$trguid %in% Ents$uid,] Rels <- unique(Rels) # Leave source proteins which contain at least 10 edges sufficientRels <- group_by(Rels, srcuid) %>% summarise(count=n()) sufficientRels <- sufficientRels %>% filter(count > 10) Rels <- Rels %>% filter(srcuid %in% sufficientRels$srcuid) ## ---- message=FALSE------------------------------------------------------ # Gene expression data evidence1 <- system.file("extdata", "e2f3_sig.txt", package = "QuaternaryProd") evidence1 <- read.table(evidence1, sep = "\t", header = TRUE, stringsAsFactors = FALSE) evidence2 <- system.file("extdata", "myc_sig.txt", package = "QuaternaryProd") evidence2 <- read.table(evidence2, sep = "\t", header = TRUE, stringsAsFactors = FALSE) evidence3 <- system.file("extdata", "ras_sig.txt", package = "QuaternaryProd") evidence3 <- read.table(evidence3, sep = "\t", header = TRUE, stringsAsFactors = FALSE) # Remove duplicated entrez ids in evidence and rename column names appropriately names(evidence1) <- c("entrez", "pvalue", "fc") evidence1 <- evidence1[!duplicated(evidence1$entrez),] names(evidence2) <- c("entrez", "pvalue", "fc") evidence2 <- evidence2[!duplicated(evidence2$entrez),] names(evidence3) <- c("entrez", "pvalue", "fc") evidence3 <- evidence3[!duplicated(evidence3$entrez),] # Run Quaternary CRE for entire Knowledge base on new evidence # which computes the statistic for each of the source proteins CRE_results <- BioQCREtoNet(Rels, evidence1, Ents, is.Logfc = TRUE) # Get FDR corrected p-values CRE_results$pvalue <- fdrtool(CRE_results$pvalue, "pvalue", FALSE, FALSE, FALSE, "fndr")$q head(CRE_results[order(CRE_results$pvalue), c("uid","name","pvalue")])