## ----setup, include = FALSE---------------------------------------------------
# Global chunk defaults for this vignette, grouped by purpose.
knitr::opts_chunk$set(
  # Execution: run every chunk for real
  eval = TRUE, # Show actual results like old vignette
  # Caching is off: disabled for CRAN to avoid massive tarball sizes
  cache = FALSE, cache.lazy = FALSE,
  # Printed output: merged with the source, each line prefixed by "#>"
  collapse = TRUE, comment = "#>",
  # Warnings and messages are not shown in the rendered document
  warning = FALSE, message = FALSE,
  # Figure dimensions
  fig.width = 7, fig.height = 5
)

## ----eval=TRUE----------------------------------------------------------------
library(lares)
library(dplyr)

## -----------------------------------------------------------------------------
# Install h2o (run once)
# install.packages("h2o")
library(h2o)

# Initialize h2o quietly for vignette.
# Both proxy variables are cleared in one call before connecting to the local
# address (presumably so a configured proxy cannot intercept traffic to
# 127.0.0.1 -- confirm). Sys.unsetenv() returns its status vector visibly, so
# invisible() keeps it out of the rendered output.
invisible(Sys.unsetenv(c("http_proxy", "https_proxy")))

# Cap the cluster at 2 threads: `nthreads = -1` would claim every CPU on the
# host, while CRAN policy limits vignettes to at most two cores.
h2o.init(nthreads = 2, max_mem_size = "2G", ip = "127.0.0.1")
h2o.no_progress() # Disable progress bars

## -----------------------------------------------------------------------------
# Load the `dft` dataset used by the examples
data(dft)

# Binary classification: fit an AutoML model with `Survived` as the outcome
model <- h2o_automl(
  df = dft,
  y = "Survived",
  target = "TRUE",
  impute = FALSE,
  # Columns kept out of the model
  ignore = c("Ticket", "Cabin", "PassengerId"),
  # Search budget
  max_time = 120,
  max_models = 10
)

# Print the fitted result
print(model)

## -----------------------------------------------------------------------------
# Elements available in the results object
names(model)

## -----------------------------------------------------------------------------
# All metrics: a list of metric-related objects
model$metrics

# Summary table of performance metrics. It sits in the nested `metrics`
# element, the same path this script uses when it builds the confusion-matrix
# subtitle (`model$metrics$metrics$AUC`); looking names up directly on
# `model$metrics` returns NULL.
model$metrics$metrics

# Specific metrics, read from the summary table
model$metrics$metrics$AUC
model$metrics$metrics$ACC
# NOTE(review): the earlier version also printed `Accuracy` and `Logloss` by
# top-level name. Accuracy appears to be the `ACC` column; Logloss does not
# look like a column of this table -- confirm where it is exposed before
# re-adding it.

## -----------------------------------------------------------------------------
# Confusion matrix from the `scores_test` element, AUC shown in the subtitle
mplot_conf(
  subtitle = sprintf("AUC: %.3f", model$metrics$metrics$AUC),
  tag = model$scores_test$tag,
  score = model$scores_test$score
)

## -----------------------------------------------------------------------------
# ROC curve from the same tags and scores
mplot_roc(tag = model$scores_test$tag, score = model$scores_test$score)

## -----------------------------------------------------------------------------
# Binary classification: Gain and Lift charts
mplot_gain(tag = model$scores_test$tag, score = model$scores_test$score)

## -----------------------------------------------------------------------------
# Variable importance dataframe: keep its first 15 rows and display them
top15 <- head(model$importance, 15)
top15

# Plot those 15 variables
mplot_importance(var = top15$variable, imp = top15$importance)

## ----eval=FALSE---------------------------------------------------------------
# # Calculate SHAP values (computationally expensive)
# shap <- h2o_shap(model)
# 
# # Plot SHAP summary
# plot(shap)

## -----------------------------------------------------------------------------
# Same outcome as before, now with explicit algorithm and preprocessing options
model <- h2o_automl(
  df = dft,
  y = "Survived",
  ignore = c("Ticket", "Cabin", "PassengerId"), # Ignore specific columns
  # Algorithms: use only these (exclude_algos also available)
  include_algos = c("GBM", "DRF"), # Gradient Boosting & Random Forest
  # Reproducible results and data split
  seed = 123,
  split = 0.7,
  # Preprocessing switches
  unique_train = TRUE, # Keep only unique training rows
  no_outliers = TRUE, # Remove outliers (Z-score > 3)
  balance = TRUE, # Handle imbalanced data
  impute = FALSE # Impute missing values (requires mice package if TRUE)
)

## -----------------------------------------------------------------------------
# Multi-class example: `Pclass` as the outcome
model_multiclass <- h2o_automl(
  df = dft, y = "Pclass",
  ignore = c("Cabin", "PassengerId"),
  max_time = 60,
  max_models = 10
)

# Metrics for the multi-class model
model_multiclass$metrics

# Multi-class confusion matrix
mplot_conf(
  score = model_multiclass$scores_test$score,
  tag = model_multiclass$scores_test$tag
)

## -----------------------------------------------------------------------------
# Regression example: `Fare` as the outcome
model_regression <- h2o_automl(
  df = dft, y = "Fare",
  ignore = c("Cabin", "PassengerId"),
  exclude_algos = NULL,
  max_models = 10
)

# Metrics for the regression model
model_regression$metrics

## -----------------------------------------------------------------------------
# Split the data with msplit(), then label each part in a `split` column
splits <- msplit(dft, seed = 123, size = 0.8)
splits$test$split <- "test"
splits$train$split <- "train"

# Stack both parts back into a single data frame (train rows first)
df_split <- rbind(splits$train, splits$test)

# Train, pointing `train_test` at the column that holds the labels
model <- h2o_automl(
  df = df_split, y = "Survived",
  max_models = 5,
  train_test = "split"
)

## -----------------------------------------------------------------------------
# New data (same structure as training): here, the first 10 rows of `dft`
new_data <- dft[seq_len(10), ]

# Score the new rows with the fitted model stored in `model$model`
predictions <- h2o_predict_model(new_data, model$model)
head(predictions)

## -----------------------------------------------------------------------------
# Get probabilities (same call as above, results inspected with head())
predictions <- h2o_predict_model(new_data, model$model)
head(predictions)

## -----------------------------------------------------------------------------
# Complete model evaluation plots, subtitled with the model's algorithm slot
mplot_full(
  subtitle = model$model@algorithm,
  tag = model$scores_test$tag,
  score = model$scores_test$score
)

## -----------------------------------------------------------------------------
# Performance over trees for the fitted model
mplot_metrics(model)

## -----------------------------------------------------------------------------
# Save model and plots under the "models" sub-directory
export_results(model, thresh = 0.5, subdir = "models")

## ----eval=FALSE---------------------------------------------------------------
# # Load model
# loaded_model <- readRDS("models/Titanic_Model/Titanic_Model.rds")
# 
# # Make predictions with MOJO (production-ready)
# predictions <- h2o_predict_MOJO(
#   model_path = "models/Titanic_Model",
#   df = dft[1:10, ]
# )

## -----------------------------------------------------------------------------
# Quick prototype: small model and time budget
model <- h2o_automl(df = dft, y = "Survived", max_time = 30, max_models = 3)

## -----------------------------------------------------------------------------
# Refine based on results: larger budget, preprocessing on, named model
model <- h2o_automl(
  df = dft, y = "Survived",
  model_name = "Titanic_Model",
  ignore = c("PassengerId", "Name", "Ticket", "Cabin"),
  balance = TRUE,
  no_outliers = TRUE,
  max_models = 20
)

## -----------------------------------------------------------------------------
# Look at several metrics, not just one
model$metrics

# Visual inspection of the model
mplot_full(score = model$scores_test$score, tag = model$scores_test$tag)

# Variable importance plot
mplot_importance(
  imp = model$importance$importance,
  var = model$importance$variable
)

## -----------------------------------------------------------------------------
# Density plot of the scores
mplot_density(score = model$scores_test$score, tag = model$scores_test$tag)

## -----------------------------------------------------------------------------
# Save everything under the "my_project" sub-directory
export_results(model, thresh = 0.5, subdir = "my_project")

## ----eval=FALSE---------------------------------------------------------------
# # Manually initialize h2o with more memory
# h2o::h2o.init(max_mem_size = "8G", nthreads = -1)

## -----------------------------------------------------------------------------
# Remove all models
h2o::h2o.removeAll()

# Shutdown h2o
h2o::h2o.shutdown(prompt = FALSE)

## -----------------------------------------------------------------------------
# Open h2o's web interface
# Navigate to: http://localhost:54321/flow/index.html