---
title: "Advanced Data Operations"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Advanced Data Operations}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  warning = FALSE,
  message = FALSE,
  error = TRUE
)
```

```{r setup}
library(clinpubr)
library(dplyr)
library(survival)
```

## Introduction

Clinical research often requires complex data manipulation: multi-source integration, fuzzy matching, temporal alignment, and format conversion. This vignette covers the advanced tools in `clinpubr` for these tasks:

1. **Cohort Screening** --- CONSORT-style exclusion tracking
2. **Data Merging** --- Substring and time-range matching
3. **Data Transformation** --- Long/wide format, multi-choice data
4. **Utility Functions** --- String manipulation, list operations
5. **Multi-Table Screening** --- `screen_data_list()` for complex cohort workflows

## Cohort Screening with Exclusion Tracking

`exclusion_count()` creates a CONSORT-style flow diagram, tracking how many patients are excluded at each stage:

```{r cohort-screening}
set.seed(123)

patients <- data.frame(
  patient_id = 1:100,
  age = round(rnorm(100, 60, 15)),
  gender = sample(c("M", "F"), 100, replace = TRUE),
  has_baseline_data = sample(c(TRUE, FALSE), 100, prob = c(0.9, 0.1), replace = TRUE),
  meets_inclusion = sample(c(TRUE, FALSE), 100, prob = c(0.8, 0.2), replace = TRUE),
  complete_followup = sample(c(TRUE, FALSE), 100, prob = c(0.85, 0.15), replace = TRUE)
)

exclusion_summary <- exclusion_count(
  patients,
  age < 50,
  gender != "M",
  !has_baseline_data,
  !meets_inclusion,
  !complete_followup
)

knitr::kable(exclusion_summary, caption = "CONSORT-Style Cohort Flow")

cat("Retention rate:", round(100 * exclusion_summary$N[nrow(exclusion_summary)] / exclusion_summary$N[1], 1), "%\n")
```

Customize exclusion criteria names:

```{r exclusion-custom}
exclusion_flow <- exclusion_count(
  patients,
  age < 18 | age > 80,
  !has_baseline_data,
  !meets_inclusion,
  .criteria_names = c(
    "Age outside 18-80 range",
    "Missing baseline data",
    "Does not meet inclusion criteria"
  )
)

knitr::kable(exclusion_flow, caption = "Custom Exclusion Flow")
```

## Data Merging Strategies

### Merge by Substring Matching

`merge_by_substring()` matches records when exact identifiers don't align (e.g., mapping free-text diagnoses to ICD codes):

```{r merge-substring}
medical_terms <- data.frame(
  match_term = c(
    "Type 2 Diabetes", "Hypertension", "Coronary Artery Disease",
    "Coronary Disease", "Chronic Kidney Disease", "Heart Failure", "Atrial Fibrillation"
  ),
  standard_term = c(
    "Type 2 Diabetes Mellitus", "Hypertension", "Coronary Artery Disease",
    "Coronary Artery Disease", "Chronic Kidney Disease", "Heart Failure", "Atrial Fibrillation"
  ),
  icd_code = c("E11", "I10", "I25", "I25", "N18", "I50", "I48"),
  category = c(
    "Endocrine", "Cardiovascular", "Cardiovascular", "Cardiovascular",
    "Renal", "Cardiovascular", "Cardiovascular"
  )
)

patient_diagnoses <- data.frame(
  patient_id = 1:10,
  diagnosis_text = c(
    "Severe Type 2 Diabetes", "Type 2 Diabetes Mellitus",
    "Type 2 Diabetes with Complications", "Essential Hypertension",
    "Hypertensive disease", "CAD - Coronary Artery Disease",
    "Coronary disease", "CKD Stage 3",
    "Congestive Heart Failure", "Heart failure chronic"
  )
)

merged_substring <- merge_by_substring(
  data = patient_diagnoses,
  key_df = medical_terms,
  search_col = "diagnosis_text",
  key_col = "match_term",
  value_cols = c("standard_term", "icd_code", "category")
)

knitr::kable(medical_terms, caption = "Medical Terms Table")
knitr::kable(merged_substring, caption = "Diagnoses with Mapped ICD Codes")
```

### Merge by Range (Time-Based Matching)

`merge_by_range()` matches events within specific time windows (e.g., lab results during hospitalization):

```{r merge-range}
patient_visits <- data.frame(
  patient_id = rep(1:3, each = 2),
  visit_id = 1:6,
  visit_start = as.Date(c(
    "2023-01-01", "2023-06-01", "2023-02-01",
    "2023-07-01", "2023-03-01", "2023-08-01"
  )),
  visit_end = as.Date(c(
    "2023-01-10", "2023-06-10", "2023-02-10",
    "2023-07-10", "2023-03-10", "2023-08-10"
  ))
)

lab_results <- data.frame(
  lab_id = 1:6,
  patient_id = c(1, 1, 2, 2, 3, 3),
  test_date = as.Date(c(
    "2023-01-05", "2023-06-05", "2023-02-03",
    "2023-07-08", "2023-03-15", "2023-08-05"
  )),
  test_name = c("Glucose", "HbA1c", "Glucose", "Creatinine", "Glucose", "Lipid panel"),
  result = round(rnorm(6, 100, 15), 1)
)

merged_range <- merge_by_range(
  x = patient_visits, y = lab_results,
  by = "patient_id",
  x_start = "visit_start", x_end = "visit_end",
  y_val = "test_date"
)

knitr::kable(merged_range, caption = "Labs Matched to Visit Windows")
```

## Data Transformation

### Long to Wide Format

`to_wide()` converts long-format data (one row per measurement) to wide format (one row per patient) to facilitate analysis:

```{r to-wide}
long_labs <- data.frame(
  patient_id = rep(1:5, each = 3),
  visit = rep(c(1, 2), times = c(8, 7)),
  test = rep(c("glucose", "creatinine", "cholesterol"), 5),
  value = round(rnorm(15, 100, 20), 1)
)

wide_labs <- to_wide(
  df = long_labs,
  keys = c("patient_id", "visit"),
  item_col = "test",
  value_col = "value"
)

knitr::kable(head(long_labs), caption = "Laboratory Data - Long Format")
knitr::kable(head(wide_labs), caption = "Laboratory Data - Wide Format")
```

### Multi-Choice Data

`split_multichoice()` splits comma-separated multi-choice columns into binary indicators; `combine_multichoice()` recombines them:

```{r multichoice}
set.seed(456)
survey_data <- data.frame(
  id = 1:20,
  symptoms = sapply(1:20, function(x) {
    paste(sample(
      c("fever", "cough", "headache", "fatigue"),
      sample(1:4, 1)
    ), collapse = ",")
  }),
  comorbidities = sapply(1:20, function(x) {
    paste(sample(
      c("diabetes", "hypertension"),
      sample(1:2, 1)
    ), collapse = ",")
  })
)

symptoms_split <- split_multichoice(
  survey_data,
  quest_cols = c("symptoms", "comorbidities"),
  split = ",",
  remove_space = FALSE
)

knitr::kable(head(survey_data), caption = "Multi-Choice Data")

knitr::kable(head(symptoms_split), caption = "Split Multi-Choice Data")

combined <- combine_multichoice(
  symptoms_split,
  quest_cols = list(
    respiratory = c("symptoms_cough", "symptoms_fatigue"),
    systemic = c("symptoms_fever", "symptoms_headache")
  ),
  sep = ","
)

knitr::kable(head(combined), caption = "Combined Symptom Groups")
```

## Utility Functions

### String Manipulation

`common_prefix()` extracts the common prefix from a character vector, could be used when processing hospital exports to identify files from the same patient or batch processing multi-site survey data by site prefix.

```{r string-utils-prefix}
file_names <- c("patient_001_lab.csv", "patient_001_visit.csv", "patient_001_demo.csv")
common_prefix(file_names)
```

`str_match_replace()` replaces matched patterns in strings with specified replacements, could be used when standardizing lab test names across hospitals or unifying option labels across translated questionnaires.

```{r string-utils-replace}
test_names <- c("Glucose_Fasting", "Glucose_Random", "Cholesterol_Total", "LDL_Calculated")
standardized <- str_match_replace(
  x = test_names,
  to_match = c("Glucose", "Cholesterol", "LDL"),
  to_replace = c("GLU", "CHOL", "LDL_CHOL")
)

knitr::kable(data.frame(Original = test_names, Standardized = standardized),
  caption = "Test Name Standardization"
)
```

### List and Data Frame Operations

`add_lists()` adds corresponding elements of two lists element-wise, could be used when aggregating disease incidence counts across time periods or summing response frequencies across survey sites.

```{r list-ops-add}
list1 <- list(diabetes = 10, hypertension = 20, asthma = 5)
list2 <- list(diabetes = 15, hypertension = 25, asthma = 8)
add_lists(list1, list2)
```

`merge_ordered_vectors()` merges multiple ordered vectors preserving unique elements and original ordering, could be used when combining inclusion criteria lists from multi-site studies or merging question orders across questionnaire versions.

```{r list-ops-merge}
sites <- list(
  c("Diabetes", "Hypertension", "Heart Failure", "CKD"),
  c("Hypertension", "COPD", "Diabetes", "Cancer"),
  c("CKD", "Diabetes", "Stroke", "Hypertension")
)
merge_ordered_vectors(sites)
```

`replace_elements()` replaces specified elements in a vector with new values, could be used when correcting data entry errors in clinical data or harmonizing option codes across survey interviewers.

```{r list-ops-replace}
sample_data <- data.frame(
  group = c("Cntrol", "Treetment", "Placebo", "Cntrol", "Treatment"),
  value = 1:5
)
recoded <- replace_elements(
  x = sample_data$group,
  from = c("Cntrol", "Treetment"),
  to = c("Control", "Treatment")
)
knitr::kable(data.frame(Original = sample_data$group, Recoded = recoded),
  caption = "Group Recoding"
)
```

`fill_with_last()` fills missing values with the last valid observation (LOCF), could be used when handling merged cells from Excel imports of EMR data or filling respondent demographics across repeated survey measures.

```{r list-ops-fill}
time_series <- c(120, NA, NA, 125, NA, 130, NA, NA)
knitr::kable(data.frame(Original = time_series, Filled = fill_with_last(time_series)),
  caption = "Last Observation Carried Forward"
)
```

## Multi-Table Cohort Screening

`screen_data_list()` filters and joins multiple tables based on clinical criteria, with full audit trails. This is essential for retrospective EHR studies.

### Scenario: Building a Diabetes Cohort

```{r workflow-setup}
# Patient demographics
patient <- data.frame(
  pid = 1:50,
  age = sample(25:75, 50, replace = TRUE),
  gender = sample(c("M", "F"), 50, replace = TRUE)
)

# Admission records
admission <- data.frame(
  pid = c(1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  vid = 101:115,
  admit_day = c(1, 30, 5, 45, 10, 60, 15, 20, 25, 35, 40, 50, 55, 65, 70)
)

# Diagnosis records
diagnosis <- data.frame(
  pid = c(1, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  vid = c(101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115),
  dx_day = c(1, 30, 5, 45, 10, 15, 20, 25, 35, 40, 50, 55, 65, 70),
  icd = c("E11", "I10", "N18", "E11", "E11", "I25", "E11", "J18", "E11", "I10", "E11", "N18", "E11", "I50")
)

# Lab results
lab <- data.frame(
  pid = c(1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  vid = c(101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115),
  lab_day = c(1, 30, 5, 45, 10, 60, 15, 20, 25, 35, 40, 50, 55, 65, 70),
  HbA1c = c(8.5, 7.2, 9.1, 7.8, 8.0, 6.9, 7.5, 8.2, 7.0, 9.5, 7.8, 8.1, 7.3, 8.8, 7.6)
)
```

### Screen and Filter

Select patients with diabetes (ICD: E11), keeping records from first diabetes diagnosis onward:

```{r workflow-screen}
screened_data <- screen_data_list(
  data_list = list(
    patient = patient, admission = admission,
    diagnosis = diagnosis, lab = lab
  ),
  entry_expr = any(icd == "E11"),
  entry_level = "patient_id",
  anchor_expr = icd == "E11",
  anchor_level = "date",
  patient_id_map = "pid",
  visit_id_map = c(admission = "vid", diagnosis = "vid", lab = "vid"),
  date_map = c(admission = "admit_day", diagnosis = "dx_day", lab = "lab_day"),
  output = "list",
  return_audit = TRUE
)

knitr::kable(screened_data$audit$entry_scope, caption = "Entry Stage Audit")
knitr::kable(screened_data$audit$anchor_scope, caption = "Anchor Stage Audit")
```

### Review Filtered Data

```{r workflow-review}
knitr::kable(head(screened_data$data$patient), caption = "Filtered Patients")
knitr::kable(screened_data$data$diagnosis, caption = "Filtered Diagnoses")
knitr::kable(screened_data$data$lab, caption = "Filtered Labs")
```

### Joined Output

Alternatively, get a single joined data frame:

```{r workflow-joined}
joined_result <- screen_data_list(
  data_list = list(
    patient = patient, admission = admission,
    diagnosis = diagnosis, lab = lab
  ),
  entry_expr = any(icd == "E11"),
  entry_level = "patient_id",
  anchor_expr = any(icd == "E11"),
  anchor_level = "date",
  anchor_window = "from_first_anchor",
  patient_id_map = "pid",
  visit_id_map = c(admission = "vid", diagnosis = "vid", lab = "vid"),
  date_map = c(admission = "admit_day", diagnosis = "dx_day", lab = "lab_day"),
  output = "joined"
)

cat("Joined data:", nrow(joined_result), "rows,", ncol(joined_result), "columns\n")
knitr::kable(head(joined_result), caption = "Joined Output")
```

## Summary

| Function | Clinical Application |
|----------|---------------------|
| `exclusion_count()` | CONSORT-style cohort flow documentation |
| `screen_data_list()` | Multi-table cohort screening with audit trails |
| `merge_by_substring()` | Substring matching for diagnosis/procedure coding |
| `merge_by_range()` | Time-window matching for longitudinal data |
| `to_wide()` | Long-to-wide format conversion |
| `split_multichoice()` / `combine_multichoice()` | Survey and symptom data processing |
| Utility functions | String standardization, list operations, NA filling |