  • The goal of the SelectSim package is to implement the SelectSim methodology for inferring inter-dependencies between functional alterations in cancer.
  • The package provides functions to generate the background model, as well as other utility functions.


  • You can install the development version of SelectSim from GitHub with:
# devtools is required for the GitHub install; uncomment if it is missing.
# install.packages("devtools")
devtools::install_github(
  "CSOgroup/SelectSim",
  dependencies = TRUE,
  build_vignettes = TRUE
)


  • We will process the LUAD MAF dataset from TCGA that is provided with the package.

  • We will also use the oncokb v3.9 cancer genes and mutations, provided with the package, to filter the MAF and create the GAM.

  • NOTE: This is an example of how to process a MAF file. To learn more about how we processed all the MAF files into run_data objects, please refer to the SelectSim_analysis repository.

## Load the example datasets shipped with SelectSim in a single call
data(
  list = c(
    "luad_maf",
    "oncokb_genes",
    "oncokb_truncating_genes",
    "variant_catalogue"
  ),
  package = "SelectSim"
)
# Check the MAF
#> [1] 220734      8
  • Let's print the number of lines and the number of genes
# Use the bundled LUAD MAF as the working input and report its row count.
input_maf <- luad_maf
print(paste0("##### Number of lines ####", "->", nrow(input_maf)))
#> [1] "##### Number of lines ####->220734"
# The oncokb gene list defines which genes are considered downstream.
genes_to_consider <- oncokb_genes
print(paste0("##### Number of genes ####", "->", length(genes_to_consider)))
#> [1] "##### Number of genes ####->396"
  • Let's create a schema of the mutation types to consider and of the columns defined in the MAF file
# Groups of mutation-type labels, keyed by how they are treated:
#   ignore     - labels excluded when the MAF is filtered by mutation type
#   truncating - labels grouped as truncating alterations
#   missense   - labels grouped as missense alterations
# The list is closed here so the snippet parses as a standalone statement.
mutation_type <- list(
  "ignore" = c(
    "Silent", "Intron", "RNA", "3'UTR", "5'UTR", "5'Flank", "3'Flank", "IGR"
  ),
  "truncating" = c(
    "Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins",
    "Nonsense_Mutation", "Nonstop_Mutation", "Splice_Region", "Splice_Site",
    "Translation_Start_Site"
  ),
  "missense" = c("Missense_Mutation")
)
# Schema describing the MAF layout: `name` labels the schema, `column` maps
# logical field names (gene, sample, mutation.type, mutation) to MAF column
# names, and the top-level `mutation.type` holds the mutation_type groups.
# NOTE(review): this snippet looks damaged. Two `column` entries have empty
# names (''), and neither `list(` call is closed. The original names cannot be
# recovered from this text; restore them from the package vignette.
custom_maf_schema = list(
    'name' = 'custom_maf',
    'column' = list(
          'gene' = 'Hugo_Symbol'
        , '' = 'Hugo_Symbol'
        , 'sample' = 'sample'
        , '' = 'sample'
        , 'mutation.type' = 'Variant_Classification'
        , 'mutation' = 'HGVSp_Short'
        # NOTE(review): the inner list presumably closes before the next line.
        'mutation.type' = mutation_type
  • Number of samples in the maf file
# Distinct values found in the MAF column that the schema names as `sample`.
mut_samples <- unique(input_maf[, custom_maf_schema$column$sample])
print(paste0("##### Number of samples ####", "->", length(mut_samples)))
#> [1] "##### Number of samples ####->502"
  • Filter the maf file to include oncokb cancer genes
# Restrict the MAF to genes_to_consider, matching on the schema's gene column.
# NOTE(review): the call below has lost its function name and first argument
# (nothing remains between `=` and `, genes`), so it does not parse as shown.
# Restore the call from the package vignette.
maf_genes =, genes = genes_to_consider, gene.col = custom_maf_schema$column$gene)
# Report how many MAF rows remain after the gene filter.
print(paste('##### Number of lines ####',nrow(maf_genes),sep="->"))
#> [1] "##### Number of lines ####->6708"

Generating the GAMs

  • Let's generate the truncating data
    • We generate the GAM using the genes for which truncating mutations should be considered.
    • We also create a TMB data frame that takes into account all truncating mutations across all genes.
# Creating Truncating GAM
# Builds the truncating GAM and a per-sample truncating-mutation count table.
# tic() starts a timer; the matching elapsed-time output is shown at the end
# (presumably from the tictoc package; the toc() call is not visible here).
tic('##### Creating Truncating GAM ####')
    # Truncating subset of the gene-filtered MAF, limited to
    # oncokb_truncating_genes (filter semantics live in filter_maf_truncating).
    maf_trunc = filter_maf_truncating(maf_genes,genes=oncokb_truncating_genes, custom_maf_schema)
    # Same filter on the full MAF, without a gene restriction; used for TMB.
    input_maf_trunc<-filter_maf_truncating(input_maf, custom_maf_schema)
    # One row per sample, with the mutation count initialised to 0.
    truncating_tmb <- data.frame('sample'=mut_samples,'mutation'=rep(0,length(mut_samples)))
    # Rows per sample in the truncating MAF; the count is read from column `n`.
    temp <- input_maf_trunc %>% count(sample) 
    # Copy counts for samples present in both tables.
    # NOTE(review): rows are selected by sample name, which relies on both
    # tables having sample IDs as row names. That is not set in the visible
    # code; confirm against the original vignette.
    truncating_tmb[intersect(truncating_tmb$sample,temp$sample),]$mutation <-temp[intersect(truncating_tmb$sample,temp$sample),'n']
    # Build the GAM from the truncating MAF over mut_samples and
    # genes_to_consider, aggregating entries with `length`.
    # NOTE(review): this call and the list() after it are cut off (remaining
    # arguments and closing parentheses are missing).
    tcga_truc_gam = maf2gam(maf_trunc,
                     sample.col = custom_maf_schema$column$sample,
                     gene.col = custom_maf_schema$column$gene,
                     value.var = 'Variant_Classification',
                     samples = mut_samples,
                     genes = genes_to_consider,
                     fun.aggregate = length,
    truncating_data <- list('gam'=tcga_truc_gam,
#> ##### Creating Truncating GAM ####: 0.219 sec elapsed
  • Let's generate the missense data
    • We generate the GAM with genes and hotspot mutations from oncokb v3.9.
    • We also create a TMB data frame that takes into account all missense mutations across all genes.
# Creating Missense GAM
# Builds the missense (hotspot) GAM and a per-sample missense count table.
# tic() starts a timer; the matching elapsed-time output is shown at the end.
tic('##### Creating Missense GAM ####')
    # Filter the full MAF on the schema's 'mutation.type' column using the
    # 'ignore' group with inclusive = FALSE (presumably dropping those rows).
    maf_valid = filter_maf_schema(input_maf,
                             schema = custom_maf_schema,
                             column = 'mutation.type',
                             values = custom_maf_schema[['mutation.type']][['ignore']],
                             inclusive = FALSE)
    # NOTE(review): the start of the next call is missing. It is probably the
    # statement that defines `missense_maf`, which is used below.
                                      variants = 'Missense_Mutation',
                                      variant.col = custom_maf_schema$column$mutation.type)
    # One row per sample, with the mutation count initialised to 0.
    missense_tmb <- data.frame('sample'=mut_samples,'mutation'=rep(0,length(mut_samples)))
    # Rows per sample in the missense MAF; the count is read from column `n`.
    temp <- missense_maf %>% count(sample) 
    # Copy counts for samples present in both tables.
    # NOTE(review): relies on sample IDs being row names; not set in view.
    missense_tmb[intersect(missense_tmb$sample,temp$sample),]$mutation <-temp[intersect(missense_tmb$sample,temp$sample),'n']
    # Keep characters 3..1000 of the schema's mutation column, dropping the
    # first two (presumably the "p." prefix of HGVSp_Short; confirm).
    t_m = substr(maf_valid[[custom_maf_schema$column$mutation]],3,1000)
    # Strip any trailing run of uppercase letters.
    t_m1 =  gsub('[A-Z]*$', '', t_m)
    maf_valid$HGVSp_Short_fixed = t_m1
    # Select hotspot rows by matching (gene, HGVSp_Short_fixed) against the
    # 'gene' and 'mut' columns of a reference table.
    # NOTE(review): no reference table is passed in the visible call; confirm
    # whether an argument was lost.
    maf_hotspot = filter_maf_mutations(maf_valid,
                                  maf.col = c(custom_maf_schema$column$gene, 'HGVSp_Short_fixed'),
                                  values.col = c('gene', 'mut'))

    # Build the GAM from the hotspot MAF over mut_samples and
    # genes_to_consider, aggregating entries with `length`.
    # NOTE(review): this call and the list() after it are cut off.
    missense_tcga_gam = maf2gam(maf_hotspot,
                     sample.col = custom_maf_schema$column$sample,
                     gene.col = custom_maf_schema$column$gene,
                     value.var = 'Variant_Classification',
                     samples = mut_samples,
                     genes = genes_to_consider,
                     fun.aggregate = length,
    # The name is spelled `missesne_data`; later code uses the same spelling.
    missesne_data <- list('gam'=missense_tcga_gam,

#> ##### Creating Missense GAM ####: 1.716 sec elapsed

Generating the run_object to run SelectX

  • We create a run_object, which is a list object consisting of:
    • M: a list object of GAMs, each a presence/absence matrix of alterations
    • tmb: a list object of tumor mutation burden data frames, whose column names should be sample and mutation
    • sample.class: a named vector of sample annotations
    • alteration.class: a named vector of alteration annotations
# Assemble the run_data object from the GAMs built above.
# Column names (genes) and row names of the missense GAM fix a common ordering.
# NOTE(review): `order` and `data` reuse the names of base/utils functions.
gene_to_take <- colnames(missesne_data$gam)
order <- rownames(missesne_data$gam)

# Each GAM is transposed after being subset to the common ordering.
# NOTE(review): this list() call is cut off; its remaining entries and closing
# parentheses are missing.
data <-list('M'=list('missense'=t(missesne_data$gam[order,gene_to_take]),

# Every alteration (GAM column) is labelled 'MUT'.
alteration_covariates <- rep('MUT',ncol(missesne_data$gam[order,gene_to_take]))
# NOTE(review): `sample_covariates` is not defined anywhere in the visible text.
run_data <- list('M'=data,'sample.class' = sample_covariates,'alteration.class' = alteration_covariates)
#> List of 3
#>  $ M               :List of 2
#>   ..$ M  :List of 2
#>   .. ..$ missense  : num [1:396, 1:502] 0 0 0 0 0 0 0 0 0 0 ...
#>   .. .. ..- attr(*, "dimnames")=List of 2
#>   .. .. .. ..$ : chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...
#>   .. .. .. ..$ : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   .. ..$ truncating: num [1:396, 1:502] 0 0 0 0 0 0 0 0 0 0 ...
#>   .. .. ..- attr(*, "dimnames")=List of 2
#>   .. .. .. ..$ : chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...
#>   .. .. .. ..$ : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   ..$ tmb:List of 2
#>   .. ..$ missense  :'data.frame':    502 obs. of  2 variables:
#>   .. .. ..$ sample  : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   .. .. ..$ mutation: num [1:502] 163 253 270 1328 100 ...
#>   .. ..$ truncating:'data.frame':    502 obs. of  2 variables:
#>   .. .. ..$ sample  : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   .. .. ..$ mutation: num [1:502] 24 45 40 206 17 18 73 31 176 108 ...
#>  $ sample.class    : Named chr [1:502] "LUAD" "LUAD" "LUAD" "LUAD" ...
#>   ..- attr(*, "names")= chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>  $ alteration.class: Named chr [1:396] "MUT" "MUT" "MUT" "MUT" ...
#>   ..- attr(*, "names")= chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...
  • Save the run_data and check the introduction vignette to see how to run selectX to discover EDs.


