--- title: "Getting Started with staRburst" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Getting Started with staRburst} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE ) ``` # Introduction staRburst makes it trivial to scale your parallel R code from your laptop to 100+ AWS workers. This vignette walks through setup and common usage patterns. ## Installation ```{r} # Install from GitHub remotes::install_github("yourname/starburst") ``` ## One-Time Setup Before using staRburst, you need to configure AWS resources. This only needs to be done once. ```{r} library(starburst) # Interactive setup wizard (takes ~2 minutes) starburst_setup() ``` This will: - Validate your AWS credentials - Create an S3 bucket for data transfer - Create an ECR repository for Docker images - Set up ECS cluster and VPC resources - Check Fargate quotas and offer to request increases ## Basic Usage The simplest way to use staRburst is with the `furrr` package: ```{r} library(furrr) library(starburst) # Define your work expensive_simulation <- function(i) { # Some computation that takes a few minutes results <- replicate(1000, { x <- rnorm(10000) mean(x^2) }) mean(results) } # Local execution (single core) plan(sequential) system.time({ results_local <- future_map(1:100, expensive_simulation) }) #> ~16 minutes on typical laptop # Cloud execution (50 workers) plan(future_starburst, workers = 50) system.time({ results_cloud <- future_map(1:100, expensive_simulation) }) #> ~2 minutes (including 45s startup) #> Cost: ~$0.85 # Results are identical identical(results_local, results_cloud) #> [1] TRUE ``` ## Example 1: Monte Carlo Simulation ```{r} library(starburst) library(furrr) # Simulate portfolio returns simulate_portfolio <- function(seed) { set.seed(seed) # Random walk for 252 trading days returns <- rnorm(252, mean = 0.0003, sd = 0.02) prices <- cumprod(1 + returns) list( final_value = prices[252], max_drawdown = max(cummax(prices) - prices) / max(prices), sharpe_ratio = mean(returns) / sd(returns) * sqrt(252) ) } # Run 10,000 simulations on 100 workers plan(future_starburst, workers = 100) results <- future_map(1:10000, simulate_portfolio, .options = furrr_options(seed = TRUE)) # Analyze results final_values <- sapply(results, `[[`, "final_value") hist(final_values, breaks = 50, main = "Distribution of Portfolio Final Values") # 95% confidence interval quantile(final_values, c(0.025, 0.975)) ``` **Performance**: - Local (single core): ~4 hours - Cloud (100 workers): ~3 minutes - Cost: ~$1.80 ## Example 2: Bootstrap Resampling ```{r} library(starburst) library(furrr) # Your data data <- read.csv("my_data.csv") # Bootstrap function bootstrap_regression <- function(i, data) { # Resample with replacement boot_indices <- sample(nrow(data), replace = TRUE) boot_data <- data[boot_indices, ] # Fit model model <- lm(y ~ x1 + x2 + x3, data = boot_data) # Return coefficients coef(model) } # Run 10,000 bootstrap samples plan(future_starburst, workers = 50) boot_results <- future_map(1:10000, bootstrap_regression, data = data) # Convert to matrix boot_coefs <- do.call(rbind, boot_results) # 95% confidence intervals for each coefficient apply(boot_coefs, 2, quantile, probs = c(0.025, 0.975)) ``` ## Example 3: Genomics Pipeline ```{r} library(starburst) library(furrr) # Process one sample process_sample <- function(sample_id) { # Read from S3 (data already in cloud) fastq_path <- 
sprintf("s3://my-genomics-data/samples/%s.fastq", sample_id) data <- read_fastq(fastq_path) # Align reads aligned <- align_reads(data, reference = "hg38") # Call variants variants <- call_variants(aligned) # Return summary list( sample_id = sample_id, num_variants = nrow(variants), variants = variants ) } # Process 1000 samples on 100 workers sample_ids <- list.files("s3://my-genomics-data/samples/", pattern = ".fastq$") plan(future_starburst, workers = 100) results <- future_map(sample_ids, process_sample, .progress = TRUE) # Combine results all_variants <- do.call(rbind, lapply(results, `[[`, "variants")) ``` **Performance**: - Local (sequential): ~208 hours (8.7 days) - Cloud (100 workers): ~2 hours - Cost: ~$47 ## Working with Data ### Data Already in S3 If your data is already in S3, workers can read it directly: ```{r} plan(future_starburst, workers = 50) results <- future_map(file_list, function(file) { # Workers read directly from S3 data <- read.csv(sprintf("s3://my-bucket/%s", file)) process(data) }) ``` ### Uploading Local Data For smaller datasets, you can pass data as arguments: ```{r} # Load data locally data <- read.csv("local_file.csv") # staRburst automatically uploads to S3 and distributes plan(future_starburst, workers = 50) results <- future_map(1:1000, function(i) { # Each worker gets a copy of 'data' bootstrap_analysis(data, i) }) ``` ### Large Data Optimization For very large objects, pre-upload to S3: ```{r} # Upload once large_data <- read.csv("huge_file.csv") s3_path <- starburst_upload(large_data, "s3://my-bucket/large_data.rds") # Workers read from S3 plan(future_starburst, workers = 100) results <- future_map(1:1000, function(i) { # Read from S3 inside worker data <- readRDS(s3_path) process(data, i) }) ``` ## Cost Management ### Estimate Costs ```{r} # Check cost before running plan(future_starburst, workers = 100, cpu = 4, memory = "8GB") #> Estimated cost: ~$3.50/hour ``` ### Set Cost Limits ```{r} # Set maximum cost per job starburst_config( max_cost_per_job = 10, # Don't start jobs that would cost >$10 cost_alert_threshold = 5 # Warn when approaching $5 ) # Now jobs exceeding limit will error before starting plan(future_starburst, workers = 1000) # Would cost ~$35/hour #> Error: Estimated cost ($35/hr) exceeds limit ($10/hr) ``` ### Track Actual Costs ```{r} plan(future_starburst, workers = 50) results <- future_map(data, process) #> Cluster runtime: 23 minutes #> Total cost: $1.34 ``` ## Quota Management ### Check Your Quota ```{r} starburst_quota_status() #> Fargate vCPU Quota: 100 / 100 used #> Allows: ~25 workers with 4 vCPUs each #> #> Recommended: Request increase to 500 vCPUs ``` ### Request Quota Increase ```{r} starburst_request_quota_increase(vcpus = 500) #> Requesting Fargate vCPU quota increase: #> Current: 100 vCPUs #> Requested: 500 vCPUs #> #> ✓ Quota increase requested (Case ID: 12345678) #> ✓ AWS typically approves within 1-24 hours ``` ### Wave-Based Execution If you request more workers than your quota allows, staRburst automatically uses wave-based execution: ```{r} # Quota allows 25 workers, but you request 100 plan(future_starburst, workers = 100, cpu = 4) #> ⚠ Requested: 100 workers (400 vCPUs) #> ⚠ Current quota: 100 vCPUs (allows 25 workers max) #> #> 📋 Execution plan: #> • Running in 4 waves of 25 workers each #> #> 💡 Request quota increase to 500 vCPUs? [y/n]: y #> #> ✓ Quota increase requested #> ⚡ Starting wave 1 (25 workers)... 
## Troubleshooting

### View Worker Logs

```{r}
# View logs from most recent cluster
starburst_logs()

# View logs from specific task
starburst_logs(task_id = "abc-123")

# View last 100 log lines
starburst_logs(last_n = 100)
```

### Check Cluster Status

```{r}
starburst_status()
#> Active Clusters:
#> • starburst-xyz123: 50 workers running
#> • starburst-abc456: 25 workers running
```

### Common Issues

**Environment mismatch**: Packages not found on workers

```{r}
# Rebuild environment
starburst_rebuild_environment()
```

**Task failures**: Some tasks failing

```{r}
# Check logs
starburst_logs(task_id = "failed-task-id")

# Often due to memory limits - increase worker memory
plan(future_starburst, workers = 50, memory = "16GB")  # Default is 8GB
```

**Slow data transfer**: Large objects taking too long

```{r}
# Use Arrow for data frames
library(arrow)
write_parquet(my_data, "s3://bucket/data.parquet")

# Workers read Arrow
results <- future_map(1:100, function(i) {
  data <- read_parquet("s3://bucket/data.parquet")
  process(data, i)
})
```

## Best Practices

### 1. Use for Right-Sized Workloads

✅ **Good**: Each task takes >5 minutes

```{r}
# 100 tasks, each takes 10 minutes
# Local: 1000 minutes, Cloud: ~10 minutes
```

❌ **Bad**: Each task takes <1 minute

```{r}
# 10000 tasks, each takes 30 seconds
# Startup (45s) and per-task overhead eat most of the potential gains
```
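
To put numbers on this trade-off, here is a back-of-the-envelope estimator (a sketch, not a staRburst function; the 45-second startup figure comes from the timings in Basic Usage, and the model deliberately ignores per-task dispatch and data-transfer costs):

```{r}
# Back-of-the-envelope sketch, not a staRburst function.
# startup_s models the one-time cluster startup (~45 s in the timings above);
# real runs also pay per-task dispatch and transfer costs that this
# simple model ignores, which is what hurts very short tasks.
estimate_wall_time <- function(n_tasks, task_secs, workers, startup_s = 45) {
  local_secs <- n_tasks * task_secs
  cloud_secs <- startup_s + ceiling(n_tasks / workers) * task_secs
  c(local_min = round(local_secs / 60, 1),
    cloud_min = round(cloud_secs / 60, 1),
    speedup   = round(local_secs / cloud_secs, 1))
}

# The "good" case above: 100 ten-minute tasks on 100 workers
estimate_wall_time(n_tasks = 100, task_secs = 600, workers = 100)
```

For sub-minute tasks, the per-task overheads this model ignores become a large fraction of each task, which is exactly what the batching pattern in the next practice addresses.
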
### 2. Batch Small Tasks

Instead of:

```{r}
# 10,000 tiny tasks
results <- future_map(1:10000, small_function)
```

Do:

```{r}
# 100 batches of 100 tasks each
batches <- split(1:10000, ceiling(seq_along(1:10000) / 100))

results <- future_map(batches, function(batch) {
  lapply(batch, small_function)
})

# Flatten results
results <- unlist(results, recursive = FALSE)
```

### 3. Use S3 for Large Data

Don't:

```{r}
big_data <- read.csv("10GB_file.csv")

# Upload for every task
results <- future_map(1:1000, function(i) process(big_data, i))
```

Do:

```{r}
# Upload once to S3 (see "Large Data Optimization" above)
s3_path <- starburst_upload(big_data, "s3://bucket/big_data.rds")

# Workers read from S3
results <- future_map(1:1000, function(i) {
  data <- readRDS(s3_path)
  process(data, i)
})
```

### 4. Set Reasonable Limits

```{r}
starburst_config(
  max_cost_per_job = 50,      # Prevent accidents
  cost_alert_threshold = 25   # Get warned early
)
```

### 5. Clean Up

```{r}
# staRburst auto-cleans, but you can force it
plan(sequential)  # Switch back to local
# Old cluster resources are cleaned up automatically
```

## Advanced: Custom Configuration

### CPU and Memory

```{r}
# High CPU, low memory (CPU-bound work)
plan(future_starburst, workers = 50, cpu = 8, memory = "16GB")

# Low CPU, high memory (memory-bound work)
plan(future_starburst, workers = 25, cpu = 4, memory = "32GB")
```

### Timeout

```{r}
# Increase timeout for long-running tasks (default 1 hour)
plan(future_starburst, workers = 10, timeout = 7200)  # 2 hours
```

### Region

```{r}
# Use specific region (default from config)
plan(future_starburst, workers = 50, region = "us-west-2")
```

## Next Steps

- Learn about [Detached Sessions](detached-sessions.html) for long-running jobs
- Explore the [Example Vignettes](https://starburst.ing/articles/) for real-world patterns
- Review the [Security Best Practices](security.html) guide
- Read the [Troubleshooting Guide](troubleshooting.html) when stuck

## Getting Help

- GitHub Issues: https://github.com/scttfrdmn/starburst/issues
- Email: help@starburst.ing
- Documentation: https://starburst.ing