## ---- include = FALSE--------------------------------------------------------- knitr::opts_chunk$set( fig.align = "center", collapse = TRUE, comment = "#>", out.width = "80%", dpi = 150 ) ## ----setup, results=FALSE, message = FALSE------------------------------------ # Load and attach our package cloneRate library(cloneRate) # Load and attach ape library(ape) # Install from CRAN if necessary, then load and attach with library() library(ggplot2) library(survival) library(ggsurvfit) library(car) ## ----setColors---------------------------------------------------------------- colorPal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") ## ----dataPreview-------------------------------------------------------------- summary(cloneRate::realCloneData) ## ----dataSplit---------------------------------------------------------------- fullTrees.list <- cloneRate::realCloneData$fullTrees cloneTrees.list <- cloneRate::realCloneData$cloneTrees ## ----classPhylo--------------------------------------------------------------- # Let's see the class of the tree PD9478 <- fullTrees.list$PD9478_1 class(PD9478) ## ----firstTreePlot, fig.asp = 0.8, fig.width = 5.5---------------------------- # Plot the full tree from individual PD9478 at timepoint 1 PD9478 <- fullTrees.list$PD9478_1 # Plot, then add scale and title plot.phylo(PD9478, direction = "downwards", show.tip.label = FALSE, edge.width = 2, edge.color = c(rep("black", 2), "red", "#0070C0", rep("black", (200))) ) axisPhylo(side = 2, backward = FALSE, las = 1) title(main = "PD9478 full tree", ylab = "Time (years)") ## ----plotSubclone, fig.asp = 0.8, fig.width = 5.5----------------------------- # Load the clone tree from our cloneTrees.list PD9478_subClone <- cloneTrees.list[["PD9478_1_clone1"]] # Plot, then add scale and title plot.phylo(PD9478_subClone, direction = "downwards", show.tip.label = FALSE, edge.width = 2, ) axisPhylo(side = 2, backward = FALSE, las = 1) title(main = "PD9478 JAK2 & DNMT3A clone tree", ylab = "Time (years)") ## ----applyMethods------------------------------------------------------------- # Get maximum likelihood and internal lengths estimates print(round(maxLikelihood(PD9478_subClone)$estimate, 3)) print(round(internalLengths(PD9478_subClone)$estimate, 3)) ## ----applyLengths------------------------------------------------------------- # Apply each of our estimates resultsLengths <- internalLengths(cloneTrees.list) ## ----plotNonBinary, fig.asp = 0.8, fig.width = 5.5---------------------------- # Plot, highlighting the 3 descendants of node 76. Then add scale and title plot.phylo(cloneTrees.list$PD5847_1_clone1, edge.width = 2, direction = "downwards", show.tip.label = FALSE, edge.color = c( rep(1, 4), "darkorange", rep(1, 34), "darkorange", rep(1, 6), "darkorange", rep(1, 1000) ) ) axisPhylo(side = 2, backward = FALSE, las = 1) title(main = "PD5847 clone 1 tree (multi-branching)", ylab = "Time (years)") ## ----applyMaxLike------------------------------------------------------------- resultsMaxLike <- suppressWarnings(maxLikelihood(cloneTrees.list)) # Preview the output print(head(resultsMaxLike[, c(1:3)])) # Print correlation coefficient of the estimates from the two methods print(stats::cor.test(resultsLengths$estimate, resultsMaxLike$estimate)$estimate) ## ----applyBDMCMC, eval=FALSE-------------------------------------------------- # # Longer running code. Not run as part of vignette. # resultsBDMCMC <- suppressWarnings(birthDeathMCMC(cloneTrees.list)) # # # Print correlation coefficient of the estimates from the two methods # print(stats::cor.test(resultsBDMCMC$estimate, resultsMaxLike$estimate)$estimate) # # > cor # # > 0.9964975 ## ----columnsInfo-------------------------------------------------------------- # Check that column names are the same stopifnot(all(colnames(resultsMaxLike) == colnames(resultsLengths))) # Print column names colnames(resultsMaxLike) ## ----viewMetdata-------------------------------------------------------------- # Show the metadata of an individual without MPN print(cloneTrees.list$PD34493_clone1$metadata) # And the metadata of an individual with MPN print(cloneTrees.list$PD9478_1_clone1$metadata) ## ----combineMetdata----------------------------------------------------------- # Combine all metadata into a single data.frame metadataAll <- do.call(rbind, lapply(cloneTrees.list, function(x) { return(x$metadata) })) ## ----joinMetaResults---------------------------------------------------------- # Combine metadata with estimates using cbind. Check if cloneNames match resultsLengthsMeta <- cbind(resultsLengths, metadataAll) stopifnot(resultsLengthsMeta$cloneName_result == resultsLengthsMeta$cloneName_meta) resultsMaxLikeMeta <- cbind(resultsMaxLike, metadataAll) stopifnot(resultsLengthsMeta$cloneName_result == resultsLengthsMeta$cloneName_meta) # Because max. likelihood performs slightly better, use that going forward results <- resultsMaxLikeMeta ## ----splitMalNorm------------------------------------------------------------- # If patient ID and clone number are the same, even if timepoint # differs, then we have a duplicate. # Make a new column for patient, removing anything after "_" from cloneName_result results$patient <- gsub("_.*", "", results$cloneName_result) results$cloneNumber <- gsub(".*_clone", "", results$cloneName_result) # Combining patient ID (without timepoint) and clone number # will give us a unique ID for the clone regardless of sampling time results$uniqueCloneID <- paste0(results$patient, "_", results$cloneNumber) # Find which clone IDs appear twice tmp <- table(results$uniqueCloneID) repeatsVec <- names(tmp)[tmp == 2] # Remove the duplicate with fewer number of cells sequenced, n rowsRemove <- c() for (cloneID in repeatsVec) { duplicateRows <- which(results$uniqueCloneID == cloneID) removeIndex <- duplicateRows[which.min(results$n[duplicateRows])] rowsRemove <- c(rowsRemove, removeIndex) } uniqueResults <- results[!c(1:nrow(results)) %in% rowsRemove, ] # Check that each unique clone now only appears once (no duplicates) stopifnot(all(table(uniqueResults$uniqueCloneID) == 1)) ## ----meanInd, out.width = '50%'----------------------------------------------- # Get the unique individuals, and initialize vectors uniqueIndividuals <- unique(uniqueResults$patient) individualMeans <- c() malNorm <- c() # Fill vectors with mean growth rate and MPN status of each individual for (ind in uniqueIndividuals) { individualMeans <- c(individualMeans, mean(uniqueResults$estimate[uniqueResults$patient == ind])) malNorm <- c(malNorm, uniqueResults$malnorm[uniqueResults$patient == ind][1]) } # Combine results into a data.frame mal_vs_norm.df <- data.frame( "Patient" = uniqueIndividuals, "meanEstimate" = individualMeans, "malNorm" = malNorm ) # Run a non-parametric Mann-whitney test to see if they're significantly different nonparamTest <- wilcox.test(meanEstimate ~ malNorm, data = mal_vs_norm.df) print(nonparamTest) # Set factor ordering for plot and plot using ggplot (Fig. 5E) mal_vs_norm.df$malNorm <- factor(mal_vs_norm.df$malNorm, levels = c("Normal", "Malignant")) ggplot(mal_vs_norm.df, aes(x = malNorm, y = meanEstimate)) + geom_label(label = paste0("p = ", round(nonparamTest$p.value, 3)), x = 1.5, y = 1.2) + geom_boxplot(width = 0.3, aes(color = malNorm), outlier.shape = NA) + geom_jitter(aes(x = malNorm, y = meanEstimate, color = malNorm), width = .1, size = 2) + scale_color_manual(values = c("black", "red"), labels = c("Normal", "MPN")) + theme_bw() + ylab("Mean net growth rate (r)") + xlab("") + theme( legend.position = "none", axis.ticks.x = element_blank(), axis.title.x = element_blank() ) ## ----multiDriver, out.width = '50%'------------------------------------------- # Set the multi driver variable according to whether multiple drivers are annotated uniqueResults$multiDriver <- "Single or unknown" uniqueResults$multiDriver[grepl("AND", uniqueResults$cloneDriver)] <- "Multiple" # Run a non-parametric test to see if the differences are significant nonParamTest <- wilcox.test(estimate ~ multiDriver, data = uniqueResults) print(nonParamTest) # Set factor levels to control plot order uniqueResults$multiDriver <- factor(uniqueResults$multiDriver, levels = c("Single or unknown", "Multiple")) # Plot (Fig. 5F) ggplot(uniqueResults, aes(x = multiDriver, y = estimate)) + geom_boxplot(width = 0.3, aes(color = multiDriver), outlier.shape = NA) + geom_jitter(aes(x = multiDriver, y = estimate, color = multiDriver), width = .1) + scale_color_manual(values = c(colorPal[4], colorPal[8])) + theme_bw() + geom_label(label = paste0("p=", round(nonParamTest$p.value, 7)), x = 1.5, y = 2) + ylab("Net growth rate (r)") + xlab("") + ggtitle("Driver mutations") + theme( legend.position = "none", axis.ticks.x = element_blank(), axis.title.x = element_blank(), plot.title = element_text(hjust = .5) ) ## ----getMax------------------------------------------------------------------- # Get the clone IDs of the fittest clone in each patient cloneIDs_max <- sapply(unique(uniqueResults$patient), function(x) { patient.df <- uniqueResults[uniqueResults$patient == x, ] cloneID <- patient.df$cloneName_result[which.max(patient.df$estimate)] cloneID }) # Get the clone IDs of the youngest clone in each patient cloneIDs_youngest <- sapply(unique(uniqueResults$patient), function(x) { patient.df <- uniqueResults[uniqueResults$patient == x, ] cloneID <- patient.df$cloneName_result[which.min(patient.df$cloneAgeEstimate)] cloneID }) # See how much overlap there is between the most recent clone and the highest growth rate table(cloneIDs_max == cloneIDs_youngest) ## ----maxStratify, out.width = '50%'------------------------------------------- # Subset the results to only have the fittest clone for each patient maxResults <- uniqueResults[uniqueResults$cloneName_result %in% cloneIDs_max, ] # Subset to the MPN patients malMax <- maxResults[maxResults$malnorm == "Malignant", ] # Determine time from clone initiation to diagnosis malMax$latency_to_dx <- malMax$cloneAgeEstimate - (malMax$age - malMax$diagnosis.age) # Stratify the population by the mean growth rate, adding a column "aboveMean" malMax$aboveMean <- malMax$estimate > mean(malMax$estimate) # Use the survival package to see if differences are significant malMax$status <- 1 survivalTest <- survival::survdiff(survival::Surv(time = latency_to_dx, event = status) ~ aboveMean, data = malMax) print(survivalTest$pvalue) # Plot survival curves survfit2(Surv(time = latency_to_dx, event = status) ~ aboveMean, data = malMax) %>% ggsurvfit() + labs( y = "Probability diagnosis-free", x = "Time from clone initiation (yrs.)" ) + scale_color_manual(values = c(colorPal[7], colorPal[3])) + theme(legend.position = "none") + geom_label(label = paste0("p = ", round(survivalTest$pvalue, 4)), x = 45, y = .85, color = "black", fill = "white") + geom_label(label = "r < mean", x = 40, y = .375, color = colorPal[7], fill = "white") + geom_label(label = "r > mean", x = 12, y = .35, color = colorPal[3], fill = "white") ## ----youngestStratify--------------------------------------------------------- youngestResults <- uniqueResults[uniqueResults$cloneName_result %in% cloneIDs_youngest, ] # Subset to the MPN patients malYoungest <- youngestResults[youngestResults$malnorm == "Malignant", ] # Check how much overlap there is between youngest clone and fittest clone in the MPN dataset table(malYoungest$cloneName_result == malMax$cloneName_result) ## ----consistency, out.width = '50%'------------------------------------------- # Like before, we can find which clone IDs appear twice tmp <- table(results$uniqueCloneID) repeatsVec <- names(tmp)[tmp == 2] # Subset the results to include only repeated clones repeatResults <- results[results$uniqueCloneID %in% repeatsVec, ] # Plot the repeated clones ggplot(repeatResults) + geom_pointrange(data = repeatResults, aes( x = cloneName_result, y = estimate, ymin = lowerBound, ymax = upperBound, color = uniqueCloneID )) + scale_color_manual(values = colorPal[c(1, 6, 7)]) + theme_bw() + ylab("Net growth rate estimate (r)") + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), legend.title = element_blank()) ## ----loadLong, out.width = '50%'---------------------------------------------- # Get data from PD9478 PD9478_long <- longitudinalData[longitudinalData$Sample.ID == "PD9478", ] # Get a rough estimate for starting params startingParams <- suppressWarnings(coef(lm(logit(VAF / .5) ~ Age, data = PD9478_long))) # Fit to a three parameter logistic curve fit <- nls(VAF ~ K / (1 + exp(-(phi2 + r * Age))), start = list(K = 0.2, phi2 = min(c(-0.000001, startingParams[1])), r = max(c(0.00001, startingParams[2]))), data = PD9478_long, trace = FALSE, algorithm = "port", lower = c(0, -500, 0.00001), upper = c(0.5, 0, 5) ) # Set output equal to summary of longitudinal models output <- summary(fit) # Assign fitted params r <- output$coefficients["r", 1] t_m <- -output$coefficients["phi2", 1] / r # Get midpoint time by dividing by -1/r K <- output$coefficients["K", 1] # Get CI for growth rate of longitudinal/logistic model stdError <- output$coefficients["r", "Std. Error"] lb <- r - 1.96 * stdError ub <- r + 1.96 * stdError # Prepare a df for plotting the fit line along with the data x <- c((min(PD9478_long$Age) - 10):(max(PD9478_long$Age) + 10)) # construct a range of x values bounded by the data y <- K / (1 + exp(-(x - t_m) * r)) # curve VAF (3 param model) predict.df <- data.frame("x" = x, "y" = y) # Set color for longitudinal fit info fitColor <- colorPal[6] # Plot ggplot(PD9478_long, aes(x = Age, y = VAF)) + theme_bw() + coord_cartesian(xlim = c(min(x), max(x)), ylim = c(-0.01, 0.52), expand = 0) + labs(x = "Person Age (yr)", y = "Variant Allele Frequency (VAF)") + ggtitle(gsub("_", " & ", paste0(PD9478_long$Sample.ID[1], " ", PD9478_long$Gene[1]))) + geom_line(data = predict.df, aes(x = x, y = y), color = fitColor, linewidth = 1, show.legend = TRUE) + geom_point(color = "#808080", shape = 18, size = 1.5) + geom_vline(xintercept = t_m, linetype = "dotted", color = fitColor, linewidth = .6) + geom_hline(yintercept = K, linetype = "dotted", color = fitColor, linewidth = .6) ## ----validateLong, out.width = '50%'------------------------------------------ # Get the single cell estimate from PD9478 clone 1 scPD9478 <- results[results$cloneName_result == "PD9478_1_clone1", ] # Combine orthogonal estimates into one df for plotting combine.df <- data.frame( "Clone" = "PD9478_1_clone1", "r" = c(r, scPD9478$estimate), "lowerBound" = c(lb, scPD9478$lowerBound), "upperBound" = c(ub, scPD9478$upperBound), "method" = c("longitudinal", "max. likelihood") ) # Plot ggplot(combine.df) + geom_pointrange(aes(x = method, y = r, ymin = lowerBound, ymax = upperBound)) + theme_bw() + theme(axis.title.x = element_blank()) + ylab("Net growth rate (r)")