## ----setup, include = FALSE---------------------------------------------------
# Global knitr chunk defaults applied to every chunk of this vignette.
knitr::opts_chunk$set(
  # Output formatting: merge source and output, prefix output lines with "#>"
  collapse = TRUE,
  comment = "#>",
  # Keep the rendered document free of warnings and package messages
  warning = FALSE,
  message = FALSE,
  # Default figure size; individual chunks may override it
  fig.width = 7,
  fig.height = 5
)
library(dplyr)

## -----------------------------------------------------------------------------
library(lares)

## -----------------------------------------------------------------------------
# Load the `dft` example dataset into the workspace and preview its first
# three rows. (Presumably the Titanic passenger data bundled with lares,
# judging by the Survived / Pclass / Fare columns used below -- confirm
# in ?dft.)
data(dft)
head(dft, 3)

## -----------------------------------------------------------------------------
# How many survived?
# Frequencies of the Survived column.
freqs(dft, Survived)

## -----------------------------------------------------------------------------
# Survival by passenger class
# Passing two columns gives frequencies for each Pclass/Survived combination.
freqs(dft, Pclass, Survived)

## ----fig.width=7, fig.height=4------------------------------------------------
# Visualize survival by class
# Same call as above, with plot = TRUE to request the graphical output.
freqs(dft, Pclass, Survived, plot = TRUE)

## ----fig.width=7, fig.height=5------------------------------------------------
# Frequencies for the whole data frame in one plot.
# NOTE(review): `top = 10` presumably caps the number of values shown per
# column -- confirm in ?freqs_df.
freqs_df(dft, plot = TRUE, top = 10)

## -----------------------------------------------------------------------------
# Correlation matrix of numeric variables
# Pearson correlations over columns 2 to 5 of dft; only the first three rows
# of the result are printed.
# NOTE(review): the columns are selected by position, so this depends on the
# column order of dft -- confirm that 2:5 are the intended (numeric) variables.
cors <- corr(dft[, 2:5], method = "pearson")
head(cors, 3)

## ----fig.width=7, fig.height=5------------------------------------------------
# Which variables correlate most with Survival?
# `top = 10` presumably keeps the ten strongest correlations -- see ?corr_var.
corr_var(dft, Survived, top = 10)

## ----fig.width=7, fig.height=5------------------------------------------------
# Top cross-correlations
# Uses columns 2 to 6 of dft (again selected by position) and requests the
# top 8 results.
corr_cross(dft[, 2:6], top = 8)

## -----------------------------------------------------------------------------
# Reduce ticket categories (keep top 5, group rest as "other")
# The result is stored separately, so dft itself is left untouched here;
# the follow-up freqs() call inspects the reduced Ticket column.
dft_reduced <- categ_reducer(dft, Ticket, top = 5)
freqs(dft_reduced, Ticket, top = 10)

## -----------------------------------------------------------------------------
# Normalize age
# NOTE: this adds an Age_norm column to the shared `dft` object, so every
# later chunk that uses dft sees it until the dataset is reloaded with
# data(dft). The head() call shows the original and normalized values
# side by side for the first five rows.
dft$Age_norm <- normalize(dft$Age)
head(dft[, c("Age", "Age_norm")], 5)

## -----------------------------------------------------------------------------
# One-hot encode passenger class
# Only the Pclass and Survived columns are passed in, so both are candidates
# for encoding. NOTE(review): `limit = 5` presumably caps the number of
# categories encoded per column -- confirm in ?ohse.
dft_encoded <- ohse(dft[, c("Pclass", "Survived")], limit = 5)
colnames(dft_encoded)

## -----------------------------------------------------------------------------
# Daily calendar from the first to the last day of 2024
all_days <- seq(
  from = as.Date("2024-01-01"),
  to = as.Date("2024-12-31"),
  by = "day"
)

# Year-month labels for the first five days
year_months <- year_month(head(all_days, 5))
year_months

# Year-quarter labels for the same five days
year_quarters <- year_quarter(head(all_days, 5))
year_quarters

# Quarter cuts for four dates spread across the year
quarter_cuts <- date_cuts(all_days[c(1, 100, 200, 300)], type = "Q")
quarter_cuts

## ----fig.width=7, fig.height=4------------------------------------------------
library(ggplot2)

# Base scatter plot of Age against Fare, coloured by survival.
# Fare is multiplied by 1000 -- presumably so the axis values are large
# enough for scale_y_abbr() to abbreviate them.
fare_scatter <- ggplot(dft, aes(x = Age, y = Fare * 1000, color = Survived)) +
  geom_point(alpha = 0.6) +
  labs(title = "Age vs Fare by Survival")

# Apply the lares theme with several of its options, then abbreviate the
# y-axis labels so they look nicer.
fare_scatter +
  theme_lares(legend = "top", grid = "Yy", pal = 2, background = "#f2f2f2") +
  scale_y_abbr()

## ----fig.width=7, fig.height=5------------------------------------------------
# Analyze Fare distribution
# NOTE(review): `breaks = 20` presumably sets how many bins the numeric Fare
# values are split into -- confirm in ?distr.
distr(dft, Fare, breaks = 20)

## -----------------------------------------------------------------------------
# Pretty-print large numbers with decimals = 2
big_values <- c(1234567, 987654.321)
formatNum(big_values, decimals = 2)

# Numbers -> abbreviated labels
raw_amounts <- c(1.5e3, 2.5e6, 1.5e9)
num_abbr(raw_amounts)

# Abbreviated labels -> numbers
abbreviated <- c("1.5K", "2.5M", "1.5B")
num_abbr(abbreviated, numeric = TRUE)

## ----fig.width=7, fig.height=4------------------------------------------------
# Mean fare per passenger class, ignoring missing fares; the result is
# returned ungrouped.
df_summary <- summarize(
  group_by(dft, Pclass),
  avg_fare = mean(Fare, na.rm = TRUE),
  .groups = "drop"
)

# Bar chart of the class averages
class_mapping <- aes(x = factor(Pclass), y = avg_fare)
ggplot(df_summary, class_mapping) +
  geom_col(fill = "#00B1DA") +
  labs(title = "Average Fare by Class", x = "Class", y = NULL) +
  scale_y_dollar() + # Format as currency
  theme_lares()

## -----------------------------------------------------------------------------
# Default call: collapse a character vector into a single string
fruits <- c("apple", "banana", "cherry")
vector2text(fruits)

# Same, joining the last item with "and"
colour_names <- c("red", "green", "blue")
vector2text(colour_names, and = "and")

# v2t() is the shorter alias
v2t(head(LETTERS, 5))

## ----fig.width=7, fig.height=5------------------------------------------------
library(dplyr)

# 1. Load and prepare data (reloads a fresh copy of dft)
data(dft)

# 2. Clean and transform: bucket Age into four labelled bands.
#    cut() intervals are right-closed: (0,18], (18,35], (35,60], (60,100]
age_breaks <- c(0, 18, 35, 60, 100)
age_labels <- c("Child", "Young", "Adult", "Senior")
dft_clean <- mutate(
  dft,
  Age_Group = cut(Age, breaks = age_breaks, labels = age_labels)
)

# 3. Analyze frequencies of survival within each age band
freqs(dft_clean, Age_Group, Survived, plot = TRUE)

# 4. Check correlations, keeping results with p-value up to 0.05.
#    NOTE(review): Survived_TRUE presumably refers to the one-hot encoded
#    TRUE level of Survived that corr_var() creates internally -- confirm
#    in ?corr_var.
corr_var(dft_clean, Survived_TRUE, top = 8, max_pvalue = 0.05)