## ----setup, include = FALSE---------------------------------------------------
# Chunk defaults for the rendered document: collapsed source/output, "#>" as
# the output prefix, centred 7 x 5 figures, warnings and messages hidden.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  fig.align = "center",
  warning = FALSE,
  message = FALSE
)

## ----load-package-------------------------------------------------------------
library(evanverse)
library(dplyr)

## ----void-concepts------------------------------------------------------------
# Three small vectors mixing regular values with NA and empty strings
void_examples <- list(
  numbers = c(1, NA, 3, 4),
  strings = c("A", "", "C", NA),
  mixed = c("text", NA, "", "data")
)

print("Examples of data with void values:")
str(void_examples)

## ----void-detection-single----------------------------------------------------
# is_void() applied to one value at a time (expected result on the right)
print(is_void(NA))       # TRUE
print(is_void(""))       # TRUE
print(is_void(NULL))     # TRUE
print(is_void("hello"))  # FALSE
print(is_void(0))        # FALSE

## ----void-detection-vector----------------------------------------------------
# any_void() on a vector that contains an empty string and an NA
test_vector <- c("A", "", "C", NA, "E")
print(any_void(test_vector))  # TRUE

# ... and on a vector without any void entries
clean_vector <- c("A", "B", "C")
print(any_void(clean_vector))  # FALSE

## ----void-detection-dataframe-------------------------------------------------
# Six-row data frame with NA / "" scattered over the name, age and city columns
sample_data <- data.frame(
  id = seq_len(6),
  name = c("Alice", "", "Charlie", NA, "Eve", "Frank"),
  age = c(25, 30, NA, 35, 28, 32),
  city = c("NYC", "LA", "", "Chicago", NA, "Boston"),
  stringsAsFactors = FALSE
)

print("Sample data with void values:")
print(sample_data)

# Which columns contain void values? Collapse the answer into one string.
void_cols <- cols_with_void(sample_data)
void_cols_label <- paste(void_cols, collapse = ", ")
print(paste("Columns with void values:", void_cols_label))

# Which rows contain void values? Same presentation as for the columns.
void_rows <- rows_with_void(sample_data)
void_rows_label <- paste(void_rows, collapse = ", ")
print(paste("Rows with void values:", void_rows_label))

## ----void-replacement-basic---------------------------------------------------
# Replace all void values with a single
# replacement
messy_vector <- c("A", "", "C", NA, "E")
clean_vector <- replace_void(messy_vector, value = "MISSING")

print("Original vector:")
print(messy_vector)
print("After replacement:")
print(clean_vector)

## ----void-replacement-selective-----------------------------------------------
# The include_* switches choose which kinds of void value get replaced
mixed_data <- c("A", "", "C", NA, "E")

# Empty strings only: NA handling switched off
only_empty <- replace_void(
  mixed_data,
  value = "EMPTY",
  include_na = FALSE,
  include_empty_str = TRUE
)
print("Replace only empty strings:")
print(only_empty)

# NA only: empty-string handling switched off
only_na <- replace_void(
  mixed_data,
  value = "NOT_AVAILABLE",
  include_na = TRUE,
  include_empty_str = FALSE
)
print("Replace only NA values:")
print(only_na)

## ----void-replacement-dataframe-----------------------------------------------
# Work on a copy and fill the two text columns one after the other
clean_data <- sample_data
for (text_col in c("name", "city")) {
  clean_data[[text_col]] <- replace_void(
    sample_data[[text_col]],
    value = "UNKNOWN"
  )
}

print("Data after void replacement:")
print(clean_data)

## ----drop-elements------------------------------------------------------------
# drop_void() applied to a vector holding an empty string and an NA
test_vector <- c("A", "", "C", NA, "E")
clean_vector <- drop_void(test_vector)

print("Original vector:")
print(test_vector)
print("After dropping void elements:")
print(clean_vector)

# For data frames, report the affected rows and columns instead
print("Rows with void values:")
print(rows_with_void(sample_data))
print("Columns with void values:")
print(cols_with_void(sample_data))

## ----df-to-list---------------------------------------------------------------
# First twelve rows of mtcars, restricted to four columns
mtcars_subset <- mtcars[seq_len(12), c("cyl", "mpg", "hp", "wt")]

# Use the cylinder count as the key and the MPG column as the values
grouped_cars <- df2list(
  data = mtcars_subset,
  key_col = "cyl",
  value_col = "mpg"
)

print("Cars grouped by cylinder count (MPG values):")
str(grouped_cars)

# Access specific
# groups
print("4-cylinder cars MPG values:")
print(grouped_cars[["4"]])

## ----column-mapping-----------------------------------------------------------
# Small table of students and their letter grades
grades_data <- data.frame(
  student = c("Alice", "Bob", "Charlie", "Diana"),
  grade_letter = c("A", "B", "A", "C")
)

# Named numeric vector: letter grade -> grade points
grade_mapping <- c(A = 4, B = 3, C = 2, D = 1, F = 0)

# Look up `grade_letter` in the mapping and store the result as `grade_numeric`
result <- map_column(
  query = grades_data,
  by = "grade_letter",
  map = grade_mapping,
  to = "grade_numeric"
)

print("Grades with numeric mapping:")
print(result)

## ----file-reading, eval=FALSE-------------------------------------------------
# # Read various file formats with automatic detection
# data1 <- read_table_flex("data.csv")
# data2 <- read_table_flex("data.tsv", sep = "\t")
# data3 <- read_table_flex("data.txt", header = TRUE)
#
# # Read Excel files with flexibility
# excel_data <- read_excel_flex("workbook.xlsx", sheet = "Sheet1")

## ----file-info, eval=FALSE----------------------------------------------------
# # Get comprehensive file information
# info <- file_info("myfile.csv")
# print(info)
#
# # Extract file extensions
# files <- c("data.csv", "analysis.R", "report.pdf")
# extensions <- sapply(files, get_ext)
# print(extensions)
#
# # Display directory structure
# file_tree(".", max_depth = 2)

## ----string-operators---------------------------------------------------------
# Build strings with the %p% infix operator
# NOTE(review): confirm how %p% joins its operands -- if it inserts its own
# separator, the explicit " " and "/" pieces below would produce extra spaces.
full_name <- "John" %p% " " %p% "Doe"
print(full_name)

file_path <- "data" %p% "/" %p% "analysis" %p% ".csv"
print(file_path)

## ----logical-operators--------------------------------------------------------
# "Not in" operator: flag the candidates that are absent from the reference set
fruits <- c("apple", "banana", "orange")
check_fruits <- c("apple", "grape", "banana", "kiwi")

is_missing <- check_fruits %nin% fruits
missing_fruits <- check_fruits[is_missing]
missing_label <- paste(missing_fruits, collapse = ", ")
print(paste("Missing fruits:", missing_label))

# Identity checks with %is% (expected result on the right)
print(5 %is% 5)      # TRUE
print("a" %is% "a")  # TRUE
print(5 %is% "5")    # FALSE

## ----combinatorial------------------------------------------------------------
# Counting selections and arrangements of 2 out of 4 items
items <- c("A", "B", "C", "D")

# Number of combinations: C(4,2) = 6
combinations_count <- comb(4, 2)
print(paste("Number of ways to choose 2 items from 4:", combinations_count))

# Number of permutations: P(4,2) = 12
permutations_count <- perm(4, 2)
print(paste("Number of ways to arrange 2 items from 4:", permutations_count))

## ----survey-example-----------------------------------------------------------
# Simulated survey export: mixing numbers with "" turns age and satisfaction
# into character columns, and income holds a non-numeric entry
survey_data <- data.frame(
  id = seq_len(8),
  age = c(25, "", 30, NA, "35", 28, 0, 45),
  income = c("50000", "", NA, "75000", "60000", "invalid", "80000", ""),
  satisfaction = c(5, 4, "", 3, NA, 5, 4, 2),
  stringsAsFactors = FALSE
)

print("Original messy survey data:")
print(survey_data)

# Step 1: report where the void values are
cat("\nData quality assessment:\n")
survey_void_cols <- paste(cols_with_void(survey_data), collapse = ", ")
cat("Columns with void values:", survey_void_cols, "\n")
survey_void_rows <- paste(rows_with_void(survey_data), collapse = ", ")
cat("Rows with void values:", survey_void_rows, "\n")

# Step 2: clean a copy of the data
# Fill void entries with a per-column default
survey_clean <- survey_data
survey_clean$age <- replace_void(survey_clean$age, value = "25")
survey_clean$income <- replace_void(survey_clean$income, value = "50000")
survey_clean$satisfaction <- replace_void(survey_clean$satisfaction, value = 3)

# Convert the three measurement columns to numeric, in their original order
measure_cols <- c("age", "income", "satisfaction")
survey_clean[measure_cols] <- lapply(survey_clean[measure_cols], as.numeric)

# Special cases: an age of 0 gets the default age, and income values that
# could not be parsed as numbers (now NA, e.g. "invalid") get the default income
age_is_zero <- survey_clean$age == 0
survey_clean$age[age_is_zero] <- 25
income_unparsed <- is.na(survey_clean$income)
survey_clean$income[income_unparsed] <- 50000

print("Cleaned survey data:")
print(survey_clean)

##
## ----performance-tips---------------------------------------------------------
# For large datasets, check specific columns rather than entire data frame.
# Random demo data: 1000 rows where col1 may contain NA, col2 may contain
# empty strings, and col3 holds uniform random numbers.
large_data <- data.frame(
  col1 = sample(c(1:100, NA), 1000, replace = TRUE),
  col2 = sample(c(letters, ""), 1000, replace = TRUE),
  col3 = runif(1000)
)

# Check only columns likely to have voids.
# vapply() is used instead of sapply() so the result is always a logical
# vector named by column (even for an empty column set), and an unexpected
# return value from any_void() raises an error instead of silently changing
# the result type.
critical_cols <- c("col1", "col2")
void_status <- vapply(
  critical_cols,
  function(col) any_void(large_data[[col]]),
  logical(1)
)

print("Void status for critical columns:")
print(void_status)