Skip to contents
  • The goal of the SelectSim package is to implement the SelectSim methodology for inferring inter-dependencies between functional alterations in cancer.
  • The SelectSim package provides functions to generate the background model, along with other utility functions.

Installation

  • You can install the development version of SelectSim from GitHub with:
# devtools is required for the GitHub install; uncomment if it is missing:
# install.packages("devtools")
devtools::install_github(
  "CSOgroup/SelectSim",
  dependencies = TRUE,
  build_vignettes = TRUE
)

Example

  • We will process the LUAD MAF dataset from TCGA that is provided with the package.

  • We will also use the OncoKB v3.9 cancer genes and mutations provided with the package to filter the MAF and create the GAM.

  • NOTE: This is an example of how to process a MAF file. To learn more about how we processed the MAF file into the run_data object, please refer to the SelectSim_analysis repository.

library(SelectSim)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
if (requireNamespace("tictoc", quietly = TRUE)) library(tictoc)
## Load the four example datasets shipped with SelectSim in a single call
data(
  list = c(
    "luad_maf",
    "oncokb_genes",
    "oncokb_truncating_genes",
    "variant_catalogue"
  ),
  package = "SelectSim"
)
# Dimensions (rows, columns) of the LUAD MAF table
dim(luad_maf)
#> [1] 220734      8
  • Let's print the number of lines in the MAF and the number of genes to consider
# Use the packaged LUAD MAF as the working input and report its row count
input_maf <- luad_maf
print(paste0("##### Number of lines ####", "->", nrow(input_maf)))
#> [1] "##### Number of lines ####->220734"
# The OncoKB gene list is the set of genes carried through the analysis
genes_to_consider <- oncokb_genes
print(paste0("##### Number of genes ####", "->", length(genes_to_consider)))
#> [1] "##### Number of genes ####->396"
  • Let's create a schema describing the mutation types to consider and the column names used in the MAF file
# Variant_Classification values grouped into the three categories used below:
# 'ignore' is passed to filter_maf_schema() with inclusive = FALSE, and the
# whole list is stored in the schema under 'mutation.type'.
mutation_type <- list(
  ignore = c(
    "Silent", "Intron", "RNA", "3'UTR", "5'UTR", "5'Flank", "3'Flank", "IGR"
  ),
  truncating = c(
    "Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins",
    "Nonsense_Mutation", "Nonstop_Mutation", "Splice_Region", "Splice_Site",
    "Translation_Start_Site"
  ),
  missense = "Missense_Mutation"
)

# Schema handed to the SelectSim filter helpers: 'column' maps generic field
# names to the column names of this particular MAF.
custom_maf_schema <- list(
  name = "custom_maf",
  column = list(
    gene = "Hugo_Symbol",
    gene.name = "Hugo_Symbol",
    sample = "sample",
    sample.name = "sample",
    mutation.type = "Variant_Classification",
    mutation = "HGVSp_Short"
  ),
  mutation.type = mutation_type
)
  • Number of samples in the maf file
# Unique sample identifiers found in the MAF. `[[` always extracts the column
# as a plain vector, whereas `[, col]` only does so for a base data.frame
# (on a tibble it would keep a one-column table and length() would report 1).
mut_samples <- unique(input_maf[[custom_maf_schema$column$sample]])
print(paste('##### Number of samples ####', length(mut_samples), sep = "->"))
#> [1] "##### Number of samples ####->502"
  • Filter the maf file to include oncokb cancer genes
# Restrict the MAF to the genes of interest, matching on the schema's gene column
maf_genes <- filter_maf_gene.name(
  input_maf,
  genes = genes_to_consider,
  gene.col = custom_maf_schema$column$gene
)
print(paste0("##### Number of lines ####", "->", nrow(maf_genes)))
#> [1] "##### Number of lines ####->6708"

Generating the GAMs

  • Let's generate the truncating data
    • We generate the GAM from the truncating mutations in the genes under consideration.
    • We also create a TMB data frame that takes the truncating mutations of all genes into consideration.
# Creating Truncating GAM
if (requireNamespace("tictoc", quietly = TRUE)) {
  tictoc::tic("##### Creating Truncating GAM ####")
}

# Truncating rows of the gene-filtered MAF, limited to the OncoKB truncating
# gene list; this is what the GAM is built from.
maf_trunc <- filter_maf_truncating(maf_genes, genes = oncokb_truncating_genes, custom_maf_schema)
# Truncating rows of the full MAF (no gene restriction); used for the burden.
input_maf_trunc <- filter_maf_truncating(input_maf, custom_maf_schema)

# Per-sample burden table: one row per sample, counts start at 0 so samples
# without any truncating row keep a burden of 0.
truncating_tmb <- data.frame(
  sample = mut_samples,
  mutation = rep(0, length(mut_samples))
)
rownames(truncating_tmb) <- mut_samples

# Count rows per sample and copy the counts into the samples that have any.
temp <- count(input_maf_trunc, sample)
rownames(temp) <- temp$sample
trunc_hits <- intersect(truncating_tmb$sample, temp$sample)
truncating_tmb[trunc_hits, "mutation"] <- temp[trunc_hits, "n"]

# Sample-by-gene alteration matrix over all samples and all genes of interest.
tcga_truc_gam <- maf2gam(
  maf_trunc,
  sample.col = custom_maf_schema$column$sample,
  gene.col = custom_maf_schema$column$gene,
  value.var = "Variant_Classification",
  samples = mut_samples,
  genes = genes_to_consider,
  fun.aggregate = length,
  binarize = TRUE,
  fill = 0
)

truncating_data <- list(gam = tcga_truc_gam, tmb = truncating_tmb)

if (requireNamespace("tictoc", quietly = TRUE)) {
  tictoc::toc()
}
#> ##### Creating Truncating GAM ####: 0.165 sec elapsed
  • Let's generate the missense data
    • We generate the GAM from the genes and hotspot mutations of OncoKB v3.9.
    • We also create a TMB data frame that takes the missense mutations of all genes into consideration.
# Creating Missense GAM
if (requireNamespace("tictoc", quietly = TRUE)) {
  tictoc::tic("##### Creating Missense GAM ####")
}

# Filter the full MAF on the schema's 'mutation.type' column using the
# 'ignore' values with inclusive = FALSE (presumably this drops the ignored
# variant classes -- confirm against filter_maf_schema).
maf_valid <- filter_maf_schema(
  input_maf,
  schema = custom_maf_schema,
  column = "mutation.type",
  values = custom_maf_schema[["mutation.type"]][["ignore"]],
  inclusive = FALSE
)

# Missense rows of the full MAF; only used for the per-sample burden below.
missense_maf <- filter_maf_mutation.type(
  input_maf,
  variants = "Missense_Mutation",
  variant.col = custom_maf_schema$column$mutation.type
)

# Per-sample burden table: one row per sample, counts start at 0.
missense_tmb <- data.frame(
  sample = mut_samples,
  mutation = rep(0, length(mut_samples))
)
rownames(missense_tmb) <- mut_samples

# Count rows per sample and copy the counts into the samples that have any.
temp <- count(missense_maf, sample)
rownames(temp) <- temp$sample
missense_hits <- intersect(missense_tmb$sample, temp$sample)
missense_tmb[missense_hits, "mutation"] <- temp[missense_hits, "n"]

# Normalise the protein change before matching it to the catalogue: substr()
# drops the first two characters (presumably the "p." prefix -- confirm) and
# gsub() strips any trailing run of capital letters.
maf_valid$HGVSp_Short_fixed <- gsub(
  "[A-Z]*$",
  "",
  substr(maf_valid[[custom_maf_schema$column$mutation]], 3, 1000)
)

# Keep the rows whose (gene, normalised mutation) pair is matched against the
# 'gene' and 'mut' columns of variant_catalogue.
maf_hotspot <- filter_maf_mutations(
  maf_valid,
  variant_catalogue,
  maf.col = c(custom_maf_schema$column$gene, "HGVSp_Short_fixed"),
  values.col = c("gene", "mut")
)

# Sample-by-gene alteration matrix over all samples and all genes of interest.
missense_tcga_gam <- maf2gam(
  maf_hotspot,
  sample.col = custom_maf_schema$column$sample,
  gene.col = custom_maf_schema$column$gene,
  value.var = "Variant_Classification",
  samples = mut_samples,
  genes = genes_to_consider,
  fun.aggregate = length,
  binarize = TRUE,
  fill = 0
)

# The object name keeps its original spelling because the run_data
# construction refers to it as `missesne_data`.
missesne_data <- list(gam = missense_tcga_gam, tmb = missense_tmb)

if (requireNamespace("tictoc", quietly = TRUE)) {
  tictoc::toc()
}
#> ##### Creating Missense GAM ####: 1.62 sec elapsed

Generating the run_object to run SelectSim

  • We create a run_object (called run_data below), a list that consists of:
    • M: a list of GAMs, each a presence/absence matrix of alterations
    • tmb: a list of tumor mutation burdens, each a data frame whose columns should be named sample and mutation
    • sample.class: a named vector of sample annotations
    • alteration.class: a named vector of alteration annotations
# The missense GAM defines the reference gene (column) and sample (row) order;
# the truncating GAM and both TMB tables are aligned to it.
gene_to_take <- colnames(missesne_data$gam)
order <- rownames(missesne_data$gam)

# GAMs are transposed so that rows are genes and columns are samples.
# drop = FALSE keeps the two-dimensional shape even when there is a single
# sample or a single gene, which would otherwise collapse to a vector.
data <- list(
  M = list(
    missense = t(missesne_data$gam[order, gene_to_take, drop = FALSE]),
    truncating = t(truncating_data$gam[order, gene_to_take, drop = FALSE])
  ),
  tmb = list(
    missense = missesne_data$tmb[order, ],
    truncating = truncating_data$tmb[order, ]
  )
)

# Every alteration is labelled 'MUT' and every sample 'LUAD'; the names carry
# the gene and sample identifiers respectively.
alteration_covariates <- rep("MUT", length(gene_to_take))
names(alteration_covariates) <- gene_to_take
sample_covariates <- rep("LUAD", length(order))
names(sample_covariates) <- order

# NOTE(review): run_data$M holds the whole `data` list, so the GAMs sit under
# run_data$M$M and the burdens under run_data$M$tmb. This shape is kept as is;
# confirm it is what selectX expects before flattening it.
run_data <- list(
  M = data,
  sample.class = sample_covariates,
  alteration.class = alteration_covariates
)
# NOTE(review): the recorded str() output lists NA entries in both GAMs even
# though maf2gam() was called with binarize = TRUE and fill = 0 -- confirm
# whether NA is expected there.
str(run_data)
#> List of 3
#>  $ M               :List of 2
#>   ..$ M  :List of 2
#>   .. ..$ missense  : num [1:396, 1:502] NA NA NA NA NA NA NA NA NA NA ...
#>   .. .. ..- attr(*, "dimnames")=List of 2
#>   .. .. .. ..$ : chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...
#>   .. .. .. ..$ : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   .. ..$ truncating: num [1:396, 1:502] 0 NA NA NA NA NA NA NA NA 0 ...
#>   .. .. ..- attr(*, "dimnames")=List of 2
#>   .. .. .. ..$ : chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...
#>   .. .. .. ..$ : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   ..$ tmb:List of 2
#>   .. ..$ missense  :'data.frame':    502 obs. of  2 variables:
#>   .. .. ..$ sample  : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   .. .. ..$ mutation: num [1:502] 163 253 270 1328 100 ...
#>   .. ..$ truncating:'data.frame':    502 obs. of  2 variables:
#>   .. .. ..$ sample  : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   .. .. ..$ mutation: num [1:502] 24 45 40 206 17 18 73 31 176 108 ...
#>  $ sample.class    : Named chr [1:502] "LUAD" "LUAD" "LUAD" "LUAD" ...
#>   ..- attr(*, "names")= chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>  $ alteration.class: Named chr [1:396] "MUT" "MUT" "MUT" "MUT" ...
#>   ..- attr(*, "names")= chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...
  • Save the run_data and check the introduction vignette to see how to run selectX to discover EDs.

SessionInfo

# Print the R version, platform, locale and the attached/loaded package
# versions used to render this page
sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#>  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#>  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#> [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#> 
#> time zone: UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] tictoc_1.2.1    dplyr_1.2.1     SelectSim_0.1.6
#> 
#> loaded via a namespace (and not attached):
#>  [1] sass_0.4.10           generics_0.1.4        tidyr_1.3.2          
#>  [4] rstatix_0.7.3         lattice_0.22-9        digest_0.6.39        
#>  [7] magrittr_2.0.5        evaluate_1.0.5        grid_4.6.0           
#> [10] RColorBrewer_1.1-3    iterators_1.0.14      fastmap_1.2.0        
#> [13] Matrix_1.7-5          foreach_1.5.2         doParallel_1.0.17    
#> [16] jsonlite_2.0.0        backports_1.5.1       Formula_1.2-5        
#> [19] purrr_1.2.2           doRNG_1.8.6.3         scales_1.4.0         
#> [22] codetools_0.2-20      textshaping_1.0.5     jquerylib_0.1.4      
#> [25] abind_1.4-8           cli_3.6.6             zigg_0.0.2           
#> [28] rlang_1.2.0           cachem_1.1.0          yaml_2.3.12          
#> [31] otel_0.2.0            tools_4.6.0           parallel_4.6.0       
#> [34] ggsignif_0.6.4        ggplot2_4.0.3         ggpubr_0.6.3         
#> [37] rngtools_1.5.2        Rfast_2.1.5.2         broom_1.0.13         
#> [40] vctrs_0.7.3           R6_2.6.1              ggridges_0.5.7       
#> [43] lifecycle_1.0.5       fs_2.1.0              car_3.1-5            
#> [46] ragg_1.5.2            pkgconfig_2.0.3       desc_1.4.3           
#> [49] RcppParallel_5.1.11-2 pkgdown_2.2.0         bslib_0.11.0         
#> [52] pillar_1.11.1         gtable_0.3.6          Rcpp_1.1.1-1.1       
#> [55] glue_1.8.1            systemfonts_1.3.2     xfun_0.58            
#> [58] tibble_3.3.1          tidyselect_1.2.1      knitr_1.51           
#> [61] farver_2.1.2          htmltools_0.5.9       rmarkdown_2.31       
#> [64] carData_3.0-6         compiler_4.6.0        S7_0.2.2