1 Introduction

This document explains the functionalities available in the a4Classif package.

This package contains functions for the classification of Affymetrix microarray data stored in an ExpressionSet. It integrates with the Automated Affymetrix Array Analysis suite of packages.

## Loading required package: a4Core
## Loading required package: a4Preproc
## 
## a4Classif version 1.61.0
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: generics
## 
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
##     as.data.frame, basename, cbind, colnames, dirname, do.call,
##     duplicated, eval, evalq, get, grep, grepl, is.unsorted, lapply,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     rank, rbind, rownames, sapply, saveRDS, table, tapply, unique,
##     unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

The ALL dataset is used to demonstrate the functionalities of the package. The genes are annotated with the addGeneInfo utility function of the a4Preproc package.

# Load the ALL dataset shipped with the 'ALL' data package.
data(ALL, package = "ALL")
# Annotate the genes with addGeneInfo() (utility function from a4Preproc).
ALL <- addGeneInfo(ALL)
## Loading required package: hgu95av2.db
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     I, expand.grid, unname
## Loading required package: org.Hs.eg.db
## 
## 
# Build the grouping factor from the first character of the BT column.
# R strings are 1-indexed, so the substring starts at position 1 rather than
# relying on a start of 0 being silently clamped.
ALL$BTtype <- as.factor(substr(ALL$BT, 1L, 1L))

2 Classify microarray data

2.1 Lasso regression

# Fit the lasso classifier on the BTtype grouping.
resultLasso <- lassoClass(
  object = ALL,
  groups = "BTtype"
)

# Plot the lasso fit.
plot(
  resultLasso,
  main = "Lasso coefficients in relation to degree of penalization.",
  label = TRUE
)

# List the 15 top genes selected by the lasso.
topTable(resultLasso, n = 15)
## The lasso selected 16 genes. The top 15 genes are:
## 
##             Gene Coefficient
## 38319_at    CD3D  0.95966733
## 35016_at    CD74 -0.60928095
## 38147_at  SH2D1A  0.49240967
## 35792_at    MGLL  0.46856925
## 37563_at  SRGAP3  0.26648240
## 38917_at  YME1L1  0.25100075
## 40278_at    GGA2 -0.25017550
## 41164_at    IGHM -0.12387272
## 41409_at THEMIS2 -0.10581122
## 38242_at    BLNK -0.10309606
## 35523_at   HPGDS  0.10169706
## 38949_at   PRKCQ  0.07832802
## 33316_at     TOX  0.06963509
## 33839_at   ITPR2  0.05801832
## 40570_at   FOXO1 -0.04858863

2.2 PAM regression

# Fit the PAM classifier on the BTtype grouping.
resultPam <- pamClass(
  object = ALL,
  groups = "BTtype"
)

# Plot the PAM fit.
plot(
  resultPam,
  main = "Pam misclassification error versus number of genes."
)

# List the 15 top genes selected by PAM.
topTable(resultPam, n = 15)
## Pam selected  67  genes. The top  15  genes are:
## 
##            GeneSymbol B.score T.score av.rank.in.CV prop.selected.in.CV
## 38319_at         CD3D -0.8467  2.4375             1                   1
## 38147_at       SH2D1A -0.5068  1.4589             2                   1
## 33238_at          LCK -0.4178  1.2027           4.2                   1
## 35016_at         CD74  0.4176 -1.2023           4.1                   1
## 38095_i_at   HLA-DPB1  0.4012  -1.155           4.8                   1
## 37039_at      HLA-DRA   0.396 -1.1399           5.8                   1
## 38096_f_at   HLA-DPB1  0.3826 -1.1015           7.1                   1
## 2059_s_at         LCK -0.3666  1.0555           7.7                   1
## 38833_at     HLA-DPA1  0.3344 -0.9628           9.2                   1
## 41723_s_at       <NA>  0.3076 -0.8855          10.9                   1
## 1110_at          TRDC -0.3022  0.8701          11.2                   1
## 38242_at         BLNK   0.281  -0.809          13.2                   1
## 1096_g_at        CD19    0.28 -0.8061          12.9                   1
## 37344_at      HLA-DMA  0.2727  -0.785          12.9                   1
## 39389_at          CD9  0.2635 -0.7585          14.8                   1
# Cross-tabulate the true versus the predicted classes for the PAM fit.
confusionMatrix(resultPam)
##     predicted
## true  B  T
##    B 95  0
##    T  0 33

2.3 Random forest

# Fix the RNG seed so that the random subset drawn below, and every result
# derived from it, is the same on each run of the vignette.
set.seed(123)

# Keep only a random subset of 100 rows of ALL to limit computation time.
ALLSubset <- ALL[sample.int(n = nrow(ALL), size = 100, replace = FALSE), ]

# Fit the random forest classifier on the reduced dataset and plot the result.
resultRf <- rfClass(object = ALLSubset, groups = "BTtype")
plot(resultRf)

# List the 15 top genes selected by the random forest.
topTable(resultRf, n = 15)
## Random forest selected 21 genes. The top 15 genes are:
## 
##            GeneSymbol
## 1350_at        CYP4F2
## 160041_at      PTPN18
## 31590_g_at       <NA>
## 32633_at         ICA1
## 33752_at     IVNS1ABP
## 34788_at        EEIG1
## 34812_at      GORASP1
## 35350_at       CHST15
## 35792_at         MGLL
## 36852_at        TUSC3
## 37589_at   ZNF710-AS1
## 38429_at         FASN
## 38865_at        GRAP2
## 39382_at        TRIM2
## 39421_at         <NA>

2.4 ROC curve

# Draw the ROC curve of the ABL1 gene for the BTtype grouping.
# The gene-symbol argument is spelled out in full (geneSymbol) instead of
# relying on partial matching of the abbreviation `gene`.
ROCcurve(geneSymbol = "ABL1", object = ALL, groups = "BTtype")
## Warning in ROCcurve(gene = "ABL1", object = ALL, groups = "BTtype"): Gene ABL1 corresponds to 6 probesets; only the first probeset ( 1635_at ) has been displayed on the plot.

3 Appendix

3.1 Session information

## R version 4.6.0 RC (2026-04-17 r89917)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.24-bioc/R/lib/libRblas.so 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_GB              LC_COLLATE=C               LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] hgu95av2.db_3.13.0   org.Hs.eg.db_3.23.1  AnnotationDbi_1.75.0 IRanges_2.47.0       S4Vectors_0.51.0     ALL_1.53.0           Biobase_2.73.0       BiocGenerics_0.59.0  generics_0.1.4       a4Classif_1.61.0     a4Preproc_1.61.0     a4Core_1.61.0       
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10          varSelRF_0.7-9       shape_1.4.6.1        RSQLite_2.4.6        lattice_0.22-9       digest_0.6.39        evaluate_1.0.5       grid_4.6.0           iterators_1.0.14     fastmap_1.2.0        blob_1.3.0           foreach_1.5.2        jsonlite_2.0.0       glmnet_4.1-10        Matrix_1.7-5         DBI_1.3.0            survival_3.8-6       httr_1.4.8           Biostrings_2.81.0    codetools_0.2-20     jquerylib_0.1.4      cli_3.6.6            crayon_1.5.3        
## [24] rlang_1.2.0          XVector_0.53.0       pamr_1.57            bit64_4.8.0          splines_4.6.0        cachem_1.1.0         yaml_2.3.12          otel_0.2.0           tools_4.6.0          parallel_4.6.0       memoise_2.0.1        ROCR_1.0-12          png_0.1-9            vctrs_0.7.3          R6_2.6.1             lifecycle_1.0.5      Seqinfo_1.3.0        KEGGREST_1.53.0      randomForest_4.7-1.2 bit_4.6.0            cluster_2.1.8.2      pkgconfig_2.0.3      bslib_0.10.0        
## [47] Rcpp_1.1.1-1.1       xfun_0.57            knitr_1.51           htmltools_0.5.9      rmarkdown_2.31       compiler_4.6.0