Data_processing

The goal of SelectSim pacakge is to implement the SelectSim methodology to infer inter-dependencies between functional alterations in cancer.
functional package provides function to generate the backgorund model and other utilites functions.

Installation

You can install the development version of SelectSim from GitHub with:

# install.packages("devtools")
devtools::install_github("CSOgroup/SelectSim",dependencies = TRUE, build_vignettes = TRUE)

For more details on installation refer to INSTALLATION

Example

We will process LUAD MAF dataset from TCGA provided with the package.
We will also use oncokb v3.9 cancer genes and mutations to filter the maf to create the gam provided with the pacakge.
NOTE: This an example to prcoess the MAF file. To know more about how we prcossed the all MAF file to run_data object, please refer to SelectSim_analysis repository.

library(SelectSim)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tictoc)
## Load the data provided with the package
data(luad_maf, package = "SelectSim")
data(oncokb_genes, package = "SelectSim")
data(oncokb_truncating_genes, package = "SelectSim")
data(variant_catalogue, package = "SelectSim")

# Check the MAF
dim(luad_maf)
#> [1] 220734      8

Let print number of lines and number of samples

input_maf <- luad_maf
print(paste('##### Number of lines ####',nrow(input_maf),sep="->"))
#> [1] "##### Number of lines ####->220734"
genes_to_consider =  oncokb_genes
print(paste('##### Number of genes ####',length(genes_to_consider),sep="->"))
#> [1] "##### Number of genes ####->396"

Let create a table schema of mutations to conisder and columns defined in maf file

mutation_type = list(
      'ignore' = c("Silent","Intron","RNA","3'UTR","5'UTR","5'Flank","3'Flank","IGR"),
      'truncating'= c('Frame_Shift_Del','Frame_Shift_Ins','In_Frame_Del','In_Frame_Ins','Nonsense_Mutation','Nonstop_Mutation','Splice_Region','Splice_Site','Translation_Start_Site'),
      'missense' = c('Missense_Mutation')
)
custom_maf_schema = list(
    'name' = 'custom_maf',
    'column' = list(
          'gene' = 'Hugo_Symbol'
        , 'gene.name' = 'Hugo_Symbol'
        , 'sample' = 'sample'
        , 'sample.name' = 'sample'
        , 'mutation.type' = 'Variant_Classification'
        , 'mutation' = 'HGVSp_Short'
        ),
        'mutation.type' = mutation_type
)

Number of samples in the maf file

mut_samples = unique(input_maf[, custom_maf_schema$column$sample])
print(paste('##### Number of samples ####',length(mut_samples),sep="->"))
#> [1] "##### Number of samples ####->502"

Filter the maf file to include oncokb cancer genes

maf_genes = filter_maf_gene.name(input_maf, genes = genes_to_consider, gene.col = custom_maf_schema$column$gene)
print(paste('##### Number of lines ####',nrow(maf_genes),sep="->"))
#> [1] "##### Number of lines ####->6708"

Genrating the GAMs

Let generate the truncating data
- We generate the GAM consider the genes to consider truncating mutations.
- We also create a TMB dataframe which takes all truncating mutation of all genes in consideration.

# Creating Truncating GAM
tic('##### Creating Truncating GAM ####')
    maf_trunc = filter_maf_truncating(maf_genes,genes=oncokb_truncating_genes, custom_maf_schema)
    input_maf_trunc<-filter_maf_truncating(input_maf, custom_maf_schema)
    truncating_tmb <- data.frame('sample'=mut_samples,'mutation'=rep(0,length(mut_samples)))
    rownames(truncating_tmb)<-mut_samples
    temp <- input_maf_trunc %>% count(sample) 
    rownames(temp)<-temp$sample
    truncating_tmb[intersect(truncating_tmb$sample,temp$sample),]$mutation <-temp[intersect(truncating_tmb$sample,temp$sample),'n']
    tcga_truc_gam = maf2gam(maf_trunc,
                     sample.col = custom_maf_schema$column$sample,
                     gene.col = custom_maf_schema$column$gene,
                     value.var = 'Variant_Classification',
                     samples = mut_samples,
                     genes = genes_to_consider,
                     fun.aggregate = length,
                     binarize=TRUE,
                     fill=0)
    truncating_data <- list('gam'=tcga_truc_gam,
                            'tmb'=truncating_tmb)
toc()
#> ##### Creating Truncating GAM ####: 0.219 sec elapsed

Let generate the Missense data
- We generate the GAM with genes and hotspot mutations from oncokb v3.9.
- We also create a TMB dataframe which takes all Missense mutation of all genes in consideration.

# Creating Missense GAM
tic('##### Creating Missense GAM ####')
    maf_valid = filter_maf_schema(input_maf,
                             schema = custom_maf_schema,
                             column = 'mutation.type',
                             values = custom_maf_schema[['mutation.type']][['ignore']],
                             inclusive = FALSE)
    missense_maf<-filter_maf_mutation.type(input_maf,
                                      variants = 'Missense_Mutation',
                                      variant.col = custom_maf_schema$column$mutation.type)
    missense_tmb <- data.frame('sample'=mut_samples,'mutation'=rep(0,length(mut_samples)))
    rownames(missense_tmb)<-mut_samples
    temp <- missense_maf %>% count(sample) 
    rownames(temp)<-temp$sample
    missense_tmb[intersect(missense_tmb$sample,temp$sample),]$mutation <-temp[intersect(missense_tmb$sample,temp$sample),'n']
    t_m = substr(maf_valid[[custom_maf_schema$column$mutation]],3,1000)
    t_m1 =  gsub('[A-Z]*$', '', t_m)
    maf_valid$HGVSp_Short_fixed = t_m1
    maf_hotspot = filter_maf_mutations(maf_valid,
                                  variant_catalogue,
                                  maf.col = c(custom_maf_schema$column$gene, 'HGVSp_Short_fixed'),
                                  values.col = c('gene', 'mut'))

    missense_tcga_gam = maf2gam(maf_hotspot,
                     sample.col = custom_maf_schema$column$sample,
                     gene.col = custom_maf_schema$column$gene,
                     value.var = 'Variant_Classification',
                     samples = mut_samples,
                     genes = genes_to_consider,
                     fun.aggregate = length,
                     binarize=TRUE,
                     fill=0)
    missesne_data <- list('gam'=missense_tcga_gam,
                          'tmb'=missense_tmb)

toc()
#> ##### Creating Missense GAM ####: 1.716 sec elapsed

Genrating the run_object to run SelectX

We create a run_object data which is list object which consists of
- M: a list object of GAMs which is presence absence matrix of alterations
- tmb: a list object of tumor mutation burden as data frame with column names (should be) as sample and mutationn
- sample.class a named vector of sample annotations
- alteration.class a named vector of alteration annotations

gene_to_take <- colnames(missesne_data$gam)
order <- rownames(missesne_data$gam)

data <-list('M'=list('missense'=t(missesne_data$gam[order,gene_to_take]),
                     'truncating'=t(truncating_data$gam[rownames(missesne_data$gam[order,]),gene_to_take])),
            'tmb'=list('missense'=missesne_data$tmb[order,],
                       'truncating'=truncating_data$tmb[order,]))

alteration_covariates <- rep('MUT',ncol(missesne_data$gam[order,gene_to_take]))
names(alteration_covariates)<-colnames(missesne_data$gam[order,gene_to_take])
sample_covariates<-rep('LUAD',length(order))
names(sample_covariates)<-order
run_data <- list('M'=data,'sample.class' = sample_covariates,'alteration.class' = alteration_covariates)
str(run_data)
#> List of 3
#>  $ M               :List of 2
#>   ..$ M  :List of 2
#>   .. ..$ missense  : num [1:396, 1:502] 0 0 0 0 0 0 0 0 0 0 ...
#>   .. .. ..- attr(*, "dimnames")=List of 2
#>   .. .. .. ..$ : chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...
#>   .. .. .. ..$ : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   .. ..$ truncating: num [1:396, 1:502] 0 0 0 0 0 0 0 0 0 0 ...
#>   .. .. ..- attr(*, "dimnames")=List of 2
#>   .. .. .. ..$ : chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...
#>   .. .. .. ..$ : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   ..$ tmb:List of 2
#>   .. ..$ missense  :'data.frame':    502 obs. of  2 variables:
#>   .. .. ..$ sample  : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   .. .. ..$ mutation: num [1:502] 163 253 270 1328 100 ...
#>   .. ..$ truncating:'data.frame':    502 obs. of  2 variables:
#>   .. .. ..$ sample  : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>   .. .. ..$ mutation: num [1:502] 24 45 40 206 17 18 73 31 176 108 ...
#>  $ sample.class    : Named chr [1:502] "LUAD" "LUAD" "LUAD" "LUAD" ...
#>   ..- attr(*, "names")= chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#>  $ alteration.class: Named chr [1:396] "MUT" "MUT" "MUT" "MUT" ...
#>   ..- attr(*, "names")= chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...

Save the run_data and check the introduction vignette to see how to run selectX to discover EDs.

SessionInfo

# Print the sessionInfo
sessionInfo()
#> R version 4.3.2 (2023-10-31)
#> Platform: x86_64-conda-linux-gnu (64-bit)
#> Running under: CentOS Linux 7 (Core)
#> 
#> Matrix products: default
#> BLAS/LAPACK: /mnt/ndata/arvind/envs/R_4/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Europe/Zurich
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] tictoc_1.2        dplyr_1.1.4       SelectSim_0.0.1.3
#> 
#> loaded via a namespace (and not attached):
#>  [1] gtable_0.3.4       xfun_0.41          bslib_0.6.1        ggplot2_3.4.4     
#>  [5] rstatix_0.7.2      lattice_0.22-5     vctrs_0.6.5        tools_4.3.2       
#>  [9] generics_0.1.3     parallel_4.3.2     tibble_3.2.1       fansi_1.0.6       
#> [13] pkgconfig_2.0.3    Matrix_1.6-5       desc_1.4.3         ggridges_0.5.5    
#> [17] rngtools_1.5.2     RcppParallel_5.1.6 lifecycle_1.0.4    compiler_4.3.2    
#> [21] stringr_1.5.1      textshaping_0.3.7  munsell_0.5.0      codetools_0.2-19  
#> [25] carData_3.0-5      htmltools_0.5.7    sass_0.4.8         yaml_2.3.8        
#> [29] pillar_1.9.0       pkgdown_2.0.7      car_3.1-2          ggpubr_0.6.0      
#> [33] jquerylib_0.1.4    tidyr_1.3.0        cachem_1.0.8       doRNG_1.8.6       
#> [37] iterators_1.0.14   abind_1.4-5        foreach_1.5.2      tidyselect_1.2.0  
#> [41] digest_0.6.34      stringi_1.8.3      reshape2_1.4.4     purrr_1.0.2       
#> [45] fastmap_1.1.1      grid_4.3.2         colorspace_2.1-0   cli_3.6.2         
#> [49] magrittr_2.0.3     Rfast_2.1.0        utf8_1.2.4         broom_1.0.5       
#> [53] scales_1.3.0       backports_1.4.1    RcppZiggurat_0.1.6 rmarkdown_2.25    
#> [57] ggsignif_0.6.4     ragg_1.2.7         memoise_2.0.1      evaluate_0.23     
#> [61] knitr_1.45         doParallel_1.0.17  rlang_1.1.3        Rcpp_1.0.12       
#> [65] glue_1.7.0         jsonlite_1.8.8     R6_2.5.1           plyr_1.8.9        
#> [69] systemfonts_1.0.5  fs_1.6.3

Arvind Iyer

2024-08-23

Installation

Example

Genrating the GAMs

Genrating the run_object to run SelectX

SessionInfo