# Walk-through script for the PM2.5 indicator ("EN.ATM.PM25.MC.M3") using
# wdiexplorer. The `## ----label, options----` lines look like knitr chunk
# headers (presumably produced by knitr::purl() from a vignette); each header
# must stay on a single line, with the chunk's code on the lines below it.
# Non-wdiexplorer functions are namespace-qualified because only wdiexplorer
# is attached via library().

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(wdiexplorer)

## ----search-indicator, eval=FALSE---------------------------------------------
# # WDI::WDIsearch("air pollution")

## ----get-data-----------------------------------------------------------------
# Retrieve the indicator data; every later step takes `pm_data` as input.
pm_data <- get_wdi_data(indicator = "EN.ATM.PM25.MC.M3", verbose = TRUE)

## -----------------------------------------------------------------------------
dplyr::glimpse(pm_data)

## ----missingness-plot, fig.width=7.5, fig.height=11.5, fig.cap="Missingness plot, providing information about the years and countries with missing entries and the overall percentages of missing and present data. It also shows that no data points are available across all countries during the years 1960 to 1989 and 2021 to 2024."----
plot_missing(wdi_data = pm_data, group_var = "region")

## ----missingness--------------------------------------------------------------
# Name of the indicator column, reused for column selection and row filtering.
index <- "EN.ATM.PM25.MC.M3"

# Per-country missingness summary for the indicator column, sorted so the
# countries with the most missing entries come first.
pm_data |>
  dplyr::select(country, region, year, tidyselect::all_of(index)) |>
  dplyr::group_by(region, country) |>
  naniar::miss_var_summary() |>
  dplyr::filter(variable == index) |>
  dplyr::arrange(dplyr::desc(n_miss))

## ----checks-function----------------------------------------------------------
# Result is intentionally not assigned, so it prints when the chunk runs.
get_valid_data(pm_data, verbose = TRUE)

## ----variation----------------------------------------------------------------
# The dissimilarity matrix is computed once and handed to compute_variation().
pm_diss_mat <- compute_dissimilarity(pm_data)

pm_variation <- compute_variation(
  pm_data,
  diss_matrix = pm_diss_mat,
  group_var = "region"
)

## ----dissimilarities----------------------------------------------------------
# Three rows with the largest `country_avg_dist`.
pm_variation |>
  dplyr::arrange(dplyr::desc(country_avg_dist)) |>
  dplyr::slice_head(n = 3)

## ----trend-shape--------------------------------------------------------------
pm_trend_shape <- compute_trend_shape_features(pm_data)

## ----trend-strength-----------------------------------------------------------
# Three rows with the largest `trend_strength`.
pm_trend_shape |>
  dplyr::arrange(dplyr::desc(trend_strength)) |>
  dplyr::slice_head(n = 3)

## ----temporal-----------------------------------------------------------------
pm_temporal <- compute_temporal_features(pm_data)

## ----flat-spot----------------------------------------------------------------
# After sorting by `flat_spot` (descending), keep the first three and the
# last three rows.
pm_temporal |>
  dplyr::arrange(dplyr::desc(flat_spot)) |>
  dplyr::slice(c(1:3, (dplyr::n() - 2):dplyr::n()))

## ----diagnostic-metrics-------------------------------------------------------
pm_diagnostic_metrics <- compute_diagnostic_indices(
  pm_data,
  group_var = "region"
)

## ----add-group----------------------------------------------------------------
# The grouped variant of the metrics is what the plots using `group_var` or
# `colour_var` below receive.
pm_diagnostic_metrics_group <- add_group_info(
  metric_summary = pm_diagnostic_metrics,
  pm_data
)

## ----distribution-plot1, fig.height=5, fig.cap="Distribution of diagnostic indices where each panel represents a different metric. It shows the spread of the metric values across countries, with each dot representing a country and coloured by region. Countries in the North America region stand out with the lowest within-group average dissimilarity and the highest silhouette width values."----
# ungrouped distribution plot
plot_metric_distribution(
  metric_summary = pm_diagnostic_metrics_group,
  colour_var = "region"
)

## ----distribution-plot2, fig.height=5, fig.cap="Distribution of diagnostic indices grouped by region. Each panel displays a metric, with countries organised by region to facilitate within and between group comparisons. The plot reveals region-specific patterns and outliers. Sub-Saharan Africa and East Asia & Pacific regions show wider spread while North America region are closely knitted."----
# grouped distribution plot
plot_metric_distribution(
  metric_summary = pm_diagnostic_metrics_group,
  colour_var = "region",
  group_var = "region"
)

## ----linearity-dist, fig.height=5, fig.cap="Distribution of the linearity metric coloured by region."----
# ungrouped distribution plot for linearity metric
plot_metric_distribution(
  metric_summary = pm_diagnostic_metrics_group,
  metric_var = "linearity",
  colour_var = "region"
)

## ----linearity-curvature-dist, fig.height=5, fig.cap="Distribution of the linearity and curvature metrics coloured by region and grouped by region."----
# grouped distribution plot for linearity and curvature metrics
plot_metric_distribution(
  metric_summary = pm_diagnostic_metrics_group,
  metric_var = c("linearity", "curvature"),
  colour_var = "region",
  group_var = "region"
)

## ----partition plot, fig.width=7.5, fig.height=11.5, fig.cap = "Country silhouette widths, grouped by region, with the average silhouette width for each region underlaid beneath the country bars. Countries in Sub-Saharan Africa and East Asia & Pacific regions all exhibit negative silhouette widths, suggesting that they do not fit well within their assigned regional groupings based on their data series, or that their behaviour may be more similar to countries in other regions."----
plot_metric_partition(
  metric_summary = pm_diagnostic_metrics_group,
  metric_var = "sil_width",
  group_var = "region"
)

## ----trajectories-plot1, fig.height=3, fig.cap="The country line plots of PM2.5 air pollution dataset. Hovering over each line displays the corresponding country name."----
# ungrouped version
plot_data_trajectories(pm_data)

## ----trajectories-plot2, fig.height=5, fig.cap="The PM2.5 air pollution data trajectories faceted by region groupings."----
# grouped version
plot_data_trajectories(pm_data, group_var = "region")

## ----metric-trajectories-plot1, fig.height=3, fig.cap="The country line plots of PM2.5 air pollution dataset. Countries with average dissimilarity distance values below or at the 95th percentile are displayed in grey, while countries with the top 5% average dissimilarity between itself and other countries are highlighted using a colour gradient. Qatar and Niger, countries displayed in purple-blue exhibit the highest dissimilarity values."----
# ungrouped version
plot_data_trajectories(
  pm_data,
  metric_summary = pm_diagnostic_metrics,
  metric_var = "country_avg_dist"
)

## ----metric-trajectories-plot2, fig.height=5, fig.cap="The PM2.5 air pollution data trajectories faceted by region groupings with group-based threshold computations rather than a uniform global threshold for highlighting countries with the top percentile. Qatar stood out with the highest dissimilarity values across other countries in Middle East & North Africa while Niger, Mauritania and Senegal are identified as countries with the highest dissimilarity within the Sub-Saharan Africa region."----
# grouped version
plot_data_trajectories(
  pm_data,
  metric_summary = pm_diagnostic_metrics_group,
  metric_var = "within_group_avg_dist",
  group_var = "region"
)

## ----parallel-plot1, fig.height=5, fig.cap = "The static version of the parallel coordinate plot displaying the metric values across all the diagnostic indices. The metric values are normalised to a scale of 0 to 1. Countries in Sub-Saharan Africa region, shown in magenta, display a wide spread across most diagnostics indices."----
plot_parallel_coords(
  diagnostic_summary = pm_diagnostic_metrics_group,
  colour_var = "region"
)

## ----parallel-plot2, fig.height=5, fig.cap="The static version of the parallel coordinate plot displaying the metric values across all diagnostic indices grouped by region. The metric values are normalised to a scale of 0 to 1 within each group. Countries in Sub-Saharan Africa region, shown in magenta, display a wide spread across most diagnostics indices."----
plot_parallel_coords(
  diagnostic_summary = pm_diagnostic_metrics_group,
  colour_var = "region",
  group_var = "region"
)

## ----link-view1, fig.height=3.5, fig.cap="The static version of the interactive link-based plot showing the relationship between linearity and curvature metrics across all countries. Each point in the scatterplot represents a country, and hovering a point reveals its corresponding data series."----
# ungrouped version
plot_metric_linkview(
  pm_data,
  metric_summary = pm_diagnostic_metrics,
  metric_var = c("linearity", "curvature")
)

## ----link-view2, fig.height=5, fig.cap="The static version of the grouped link-based plot showing the relationship between linearity and curvature metrics across all countries faceted by region. Each point in the scatterplot represents a country, and hovering a point reveals its corresponding data series in its panel."----
# grouped version
plot_metric_linkview(
  pm_data,
  metric_summary = pm_diagnostic_metrics_group,
  metric_var = c("linearity", "curvature"),
  group_var = "region"
)