## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, warning = FALSE, message = FALSE, error = TRUE ) ## ----setup-------------------------------------------------------------------- library(clinpubr) library(dplyr) ## ----create-messy-data-------------------------------------------------------- set.seed(123) n <- 100 # Patient IDs patient_ids <- paste0("P", sprintf("%03d", 1:n)) # Glucose: mix of mg/dL and mmol/L (1 mmol/L = 18 mg/dL) # Also include some outliers and missing values glucose_vals <- c( rnorm(35, mean = 100, sd = 15), # mg/dL normal range rnorm(30, mean = 5.5, sd = 0.8), # mmol/L normal range rnorm(10, mean = 200, sd = 20), # mg/dL high (outliers) rnorm(10, mean = 15, sd = 2), # mmol/L high (outliers) rep(NA, 10), # missing values 999, 888, 0, -50, 500 # erroneous values ) glucose_vals <- round(sample(glucose_vals, n), 1) glucose_units <- sample(c("mg/dL", "mmol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1)) # Creatinine: mix of mg/dL and umol/L (1 mg/dL = 88.4 umol/L) creatinine_vals <- c( rnorm(35, mean = 1.0, sd = 0.2), # mg/dL normal range rnorm(30, mean = 88, sd = 15), # umol/L normal range rnorm(10, mean = 3.0, sd = 0.5), # mg/dL high (outliers) rnorm(10, mean = 265, sd = 44), # umol/L high (outliers) rep(NA, 10), # missing values 999, 0, -10, 1000, 0.001 # erroneous values ) creatinine_vals <- round(sample(creatinine_vals, n), 2) creatinine_units <- sample(c("mg/dL", "umol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1)) # Cholesterol: mix of mg/dL and mmol/L (1 mmol/L = 38.67 mg/dL) cholesterol_vals <- c( rnorm(35, mean = 180, sd = 30), # mg/dL normal range rnorm(30, mean = 4.5, sd = 0.8), # mmol/L normal range rnorm(10, mean = 350, sd = 40), # mg/dL high (outliers) rnorm(10, mean = 9, sd = 1), # mmol/L high (outliers) rep(NA, 10), # missing values 888, 999, 50, 600, 0 # erroneous values ) cholesterol_vals <- round(sample(cholesterol_vals, n), 2) cholesterol_units <- sample(c("mg/dL", "mmol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1)) # Test dates with various formats and impossible dates test_date_vals <- sample(c( format(Sys.Date() - sample(1:365, 60, replace = TRUE), "%Y-%m-%d"), # ISO format format(Sys.Date() - sample(1:365, 30, replace = TRUE), "%Y/%m/%d"), # Slash format "1900-01-01", "2030-12-31", "N/A", "", "unknown", "pending", # Invalid dates rep(NA, 10) ), n * 3, replace = TRUE) # Create long-format lab data with messy values lab_data <- data.frame( patient_id = rep(patient_ids, 3), test = rep(c("Glucose", "Creatinine", "Cholesterol"), each = n), value = c(glucose_vals, creatinine_vals, cholesterol_vals), unit = c(glucose_units, creatinine_units, cholesterol_units), test_date = test_date_vals ) # Add messy string values that need cleaning # Glucose with text annotations and European decimal commas messy_glucose_idx <- sample(which(lab_data$test == "Glucose"), 15) lab_data$value[messy_glucose_idx] <- sample(c( "<40", ">500", "5.2", "6.8", "120 mg/dL", "89 mmol/L", "5.2 (fasting)", "180 (post-meal)", "N/A", "pending", "6..5", "7..2", "8.5.1", "normal range", "see comment" ), length(messy_glucose_idx), replace = TRUE) # Creatinine with text annotations messy_creatinine_idx <- sample(which(lab_data$test == "Creatinine"), 12) lab_data$value[messy_creatinine_idx] <- sample(c( "<0.5", ">5.0", "1.1", "0.9", "1.2 mg/dL", "110 umol/L", "1.5 (dialysis)", "2.8 (critical)", "N/A", " hemolyzed", "1.3 ", " 1.4", "(1.2)", "1.0*", "see note" ), length(messy_creatinine_idx), replace = TRUE) # Cholesterol with text annotations messy_cholesterol_idx <- sample(which(lab_data$test == "Cholesterol"), 10) lab_data$value[messy_cholesterol_idx] <- sample(c( "<100", ">400", "4.5", "5.2", "200 mg/dL", "5.5 mmol/L", "180 (fasting)", "250 (borderline)", "N/A", "lipemic" ), length(messy_cholesterol_idx), replace = TRUE) knitr::kable(head(lab_data[sample(nrow(lab_data)), ], 10), caption = "Original Messy Lab Data (10 random rows)" ) ## ----data-overview------------------------------------------------------------ overview <- data_overview(lab_data) print(overview$variable_types) print(overview$summary_stats) print(overview$quality_issues$missing_values) ## ----check-units-------------------------------------------------------------- knitr::kable(unit_view(lab_data, subject_col = "test", value_col = "value", unit_col = "unit"), caption = "Unit Conflicts by Test Type" ) ## ----check-nonnum------------------------------------------------------------- nonnum_df <- df_view_nonnum(lab_data) knitr::kable(head(nonnum_df, 15), caption = "Non-numeric Entries by Variable") ## ----extract-numeric---------------------------------------------------------- # Create a copy for cleaning lab_data_cleaned <- lab_data # Extract numeric values from the messy value column lab_data_cleaned$value_numeric <- extract_num(lab_data$value) # Show before/after comparison comparison <- data.frame( test = lab_data$test, original = lab_data$value, cleaned = lab_data_cleaned$value_numeric, unit = lab_data$unit ) %>% dplyr::filter(original != cleaned | is.na(cleaned)) %>% head(15) knitr::kable(comparison, caption = "Value Cleaning: Before vs After") ## ----date-conversion---------------------------------------------------------- # Convert various date formats to standard Date objects lab_data_cleaned$test_date_clean <- to_date(lab_data$test_date) # Show date conversion results date_comparison <- data.frame( original = lab_data$test_date, cleaned = lab_data_cleaned$test_date_clean ) %>% dplyr::filter(!is.na(original)) %>% head(15) knitr::kable(date_comparison, caption = "Date Conversion: Before vs After") ## ----check-invalid-dates------------------------------------------------------ invalid_dates <- lab_data_cleaned %>% dplyr::filter(!is.na(test_date) & is.na(test_date_clean)) %>% dplyr::select(patient_id, test, test_date) %>% head(10) knitr::kable(invalid_dates, caption = "Invalid Date Entries (Could Not Be Parsed)") ## ----update-cleaned-data------------------------------------------------------ lab_data <- lab_data_cleaned %>% dplyr::mutate( value = value_numeric, test_date = test_date_clean ) %>% dplyr::select(-value_numeric, -test_date_clean) ## ----unit-standardize-apply--------------------------------------------------- change_rules <- list( list(subject = "Glucose", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 18), list(subject = "Creatinine", target_unit = "mg/dL", units2change = "umol/L", coeffs = 1 / 88.4), list(subject = "Cholesterol", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 38.67) ) lab_data_std <- unit_standardize( lab_data, subject_col = "test", value_col = "value", unit_col = "unit", change_rules = change_rules ) knitr::kable(head(lab_data_std, 15), caption = "Lab Data After Unit Standardization") ## ----verify-standardization--------------------------------------------------- knitr::kable(unit_view(lab_data_std, subject_col = "test", value_col = "value", unit_col = "unit"), caption = "Units After Standardization" ) ## ----outlier-detection-------------------------------------------------------- # Split data by test type for outlier detection lab_data_clean <- lab_data_std # Detect outliers for each test for (test_name in c("Glucose", "Creatinine", "Cholesterol")) { test_data <- lab_data_std$value[lab_data_std$test == test_name] outlier_res <- detect_outliers(test_data, method = "iqr") cat("\n", test_name, ":\n") cat(" Total values:", length(test_data), "\n") cat(" Outliers detected:", sum(outlier_res$outlier_mask, na.rm = TRUE), "\n") cat(" Missing values:", sum(is.na(test_data)), "\n") } ## ----compare-methods---------------------------------------------------------- glucose_values <- lab_data_std$value[lab_data_std$test == "Glucose"] knitr::kable(data.frame( Method = c("MAD", "IQR", "Z-score"), Outlier_Count = c( sum(mad_outlier(glucose_values, threshold = 3), na.rm = TRUE), sum(iqr_outlier(glucose_values, threshold = 1.5), na.rm = TRUE), sum(zscore_outlier(glucose_values, threshold = 3), na.rm = TRUE) ) ), caption = "Outlier Detection Results by Method (Glucose)") ## ----handle-outliers---------------------------------------------------------- # Create a copy for outlier handling lab_data_final <- lab_data_std # Set outliers to NA for each test type for (test_name in c("Glucose", "Creatinine", "Cholesterol")) { test_idx <- lab_data_std$test == test_name test_values <- lab_data_std$value[test_idx] outlier_res <- detect_outliers(test_values, method = "iqr") # Set outliers to NA lab_data_final$value[test_idx][outlier_res$outlier_mask] <- NA } # Compare before and after outlier handling comparison_df <- data.frame( Test = rep(c("Glucose", "Creatinine", "Cholesterol"), each = 2), Stage = rep(c("Before", "After"), 3), N = c( sum(!is.na(lab_data_std$value[lab_data_std$test == "Glucose"])), sum(!is.na(lab_data_final$value[lab_data_final$test == "Glucose"])), sum(!is.na(lab_data_std$value[lab_data_std$test == "Creatinine"])), sum(!is.na(lab_data_final$value[lab_data_final$test == "Creatinine"])), sum(!is.na(lab_data_std$value[lab_data_std$test == "Cholesterol"])), sum(!is.na(lab_data_final$value[lab_data_final$test == "Cholesterol"])) ), Mean = c( mean(lab_data_std$value[lab_data_std$test == "Glucose"], na.rm = TRUE), mean(lab_data_final$value[lab_data_final$test == "Glucose"], na.rm = TRUE), mean(lab_data_std$value[lab_data_std$test == "Creatinine"], na.rm = TRUE), mean(lab_data_final$value[lab_data_final$test == "Creatinine"], na.rm = TRUE), mean(lab_data_std$value[lab_data_std$test == "Cholesterol"], na.rm = TRUE), mean(lab_data_final$value[lab_data_final$test == "Cholesterol"], na.rm = TRUE) ) ) knitr::kable(comparison_df, caption = "Before vs After Outlier Handling", digits = 2) ## ----wide-format-------------------------------------------------------------- # Convert to wide format preserving all test dates # This creates a longitudinal cohort dataset lab_wide <- lab_data_final %>% filter(!is.na(test_date)) %>% # Remove records without valid dates select(patient_id, test_date, test, value) %>% tidyr::pivot_wider( names_from = test, values_from = value, names_sort = TRUE ) # Sort by patient and date for longitudinal view lab_wide <- lab_wide %>% arrange(patient_id, test_date) knitr::kable(head(lab_wide, 10), caption = "Longitudinal Lab Data (First 10 records)") ## ----cohort-structure--------------------------------------------------------- cohort_summary <- lab_wide %>% group_by(patient_id) %>% summarise( n_visits = n(), first_date = min(test_date), last_date = max(test_date) ) %>% head(10) knitr::kable(cohort_summary, caption = "Cohort Structure: Visits per Patient") ## ----missing-patterns--------------------------------------------------------- missing_summary <- data.frame( Variable = names(lab_wide)[-1], Missing_Count = sapply(lab_wide[-1], function(x) sum(is.na(x))), Missing_Percent = round(sapply(lab_wide[-1], function(x) sum(is.na(x)) / length(x) * 100, 2)) ) knitr::kable(missing_summary, caption = "Missing Data Summary by Test") ## ----filter-missing----------------------------------------------------------- lab_wide_clean <- get_valid_subset( lab_wide, row_na_ratio = 0.5, # Allow up to 50% missing per record col_na_ratio = 0.3 # Allow up to 30% missing per test type ) cat( "Dimensions:", nrow(lab_wide), "records", "->", nrow(lab_wide_clean), "records\n" ) cat( "Unique patients:", length(unique(lab_wide$patient_id)), "->", length(unique(lab_wide_clean$patient_id)), "\n" ) ## ----clinical-categories------------------------------------------------------ # Glucose categories lab_wide_clean$glucose_category <- cut_by( lab_wide_clean$Glucose, breaks = c(100, 126), labels = c("Normal", "Prediabetes", "Diabetes"), label_with_range = FALSE ) # Creatinine categories (eGFR approximation) lab_wide_clean$renal_function <- cut_by( lab_wide_clean$Creatinine, breaks = c(1.2, 2.0, 4.0), labels = c("Normal", "Mild", "Moderate", "Severe"), label_with_range = FALSE ) # Cholesterol categories lab_wide_clean$cholesterol_category <- cut_by( lab_wide_clean$Cholesterol, breaks = c(200, 240), labels = c("Desirable", "Borderline", "High"), label_with_range = FALSE ) # Summary of categories across all visits cat_summary <- data.frame( Category = c( rep("Glucose", length(table(lab_wide_clean$glucose_category))), rep("Renal Function", length(table(lab_wide_clean$renal_function))), rep("Cholesterol", length(table(lab_wide_clean$cholesterol_category))) ), Level = c( names(table(lab_wide_clean$glucose_category)), names(table(lab_wide_clean$renal_function)), names(table(lab_wide_clean$cholesterol_category)) ), Count = c( as.vector(table(lab_wide_clean$glucose_category)), as.vector(table(lab_wide_clean$renal_function)), as.vector(table(lab_wide_clean$cholesterol_category)) ) ) knitr::kable(cat_summary, caption = "Clinical Category Distributions (All Visits)") ## ----longitudinal-view-------------------------------------------------------- # Select one patient with multiple visits for demonstration patient_trajectory <- lab_wide_clean %>% filter(patient_id == unique(patient_id)[1]) %>% select( patient_id, test_date, Glucose, Creatinine, Cholesterol, glucose_category, renal_function ) %>% arrange(test_date) knitr::kable(patient_trajectory, caption = "Example: Single Patient's Lab Trajectory") ## ----complete-pipeline-------------------------------------------------------- # Step 1: Data Overview overview <- data_overview(lab_data) # Step 2: Format Cleaning - Extract numeric values and convert dates clean <- lab_data clean$value <- extract_num(lab_data$value) clean$test_date <- to_date(lab_data$test_date) # Step 3: Unit Standardization change_rules <- list( list(subject = "Glucose", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 18), list(subject = "Creatinine", target_unit = "mg/dL", units2change = "umol/L", coeffs = 1 / 88.4), list(subject = "Cholesterol", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 38.67) ) clean <- unit_standardize( clean, subject_col = "test", value_col = "value", unit_col = "unit", change_rules = change_rules ) # Step 4: Outlier Detection and Handling for (test_name in c("Glucose", "Creatinine", "Cholesterol")) { test_idx <- clean$test == test_name test_values <- clean$value[test_idx] outlier_res <- detect_outliers(test_values, method = "iqr") clean$value[test_idx][outlier_res$outlier_mask] <- NA } # Step 5: Convert to wide format (longitudinal cohort data) clean_wide <- clean %>% filter(!is.na(test_date)) %>% select(patient_id, test_date, test, value) %>% tidyr::pivot_wider(names_from = test, values_from = value) %>% arrange(patient_id, test_date) # Step 6: Missing value filtering clean_wide <- get_valid_subset(clean_wide, row_na_ratio = 0.5, col_na_ratio = 0.3) # Step 7: Create clinical categories clean_wide$glucose_category <- cut_by( clean_wide$Glucose, breaks = c(100, 126), labels = c("Normal", "Prediabetes", "Diabetes") ) # Step 8: Final check final_overview <- data_overview(clean_wide) knitr::kable(final_overview$summary_stats, caption = "Final Data Quality Overview") cat( "\nOriginal records:", nrow(lab_data), "| Final records:", nrow(clean_wide), "| Removed:", nrow(lab_data) - nrow(clean_wide), "\n" ) cat( "Unique patients:", length(unique(lab_data$patient_id)), "->", length(unique(clean_wide$patient_id)), "\n" )