Data_processing
Arvind Iyer
2024-08-23
data_processing.Rmd
- The goal of SelectSim pacakge is to implement the SelectSim methodology to infer inter-dependencies between functional alterations in cancer.
-
functional
package provides function to generate the backgorund model and other utilites functions.
Installation
- You can install the development version of SelectSim from GitHub with:
# install.packages("devtools")
devtools::install_github("CSOgroup/SelectSim",dependencies = TRUE, build_vignettes = TRUE)
- For more details on installation refer to INSTALLATION
Example
We will process LUAD MAF dataset from TCGA provided with the package.
We will also use oncokb v3.9 cancer genes and mutations to filter the maf to create the gam provided with the pacakge.
NOTE: This an example to prcoess the MAF file. To know more about how we prcossed the all MAF file to run_data object, please refer to SelectSim_analysis repository.
library(SelectSim)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tictoc)
## Load the data provided with the package
data(luad_maf, package = "SelectSim")
data(oncokb_genes, package = "SelectSim")
data(oncokb_truncating_genes, package = "SelectSim")
data(variant_catalogue, package = "SelectSim")
# Check the MAF
dim(luad_maf)
#> [1] 220734 8
- Let print number of lines and number of samples
input_maf <- luad_maf
print(paste('##### Number of lines ####',nrow(input_maf),sep="->"))
#> [1] "##### Number of lines ####->220734"
genes_to_consider = oncokb_genes
print(paste('##### Number of genes ####',length(genes_to_consider),sep="->"))
#> [1] "##### Number of genes ####->396"
- Let create a table schema of mutations to conisder and columns defined in maf file
mutation_type = list(
'ignore' = c("Silent","Intron","RNA","3'UTR","5'UTR","5'Flank","3'Flank","IGR"),
'truncating'= c('Frame_Shift_Del','Frame_Shift_Ins','In_Frame_Del','In_Frame_Ins','Nonsense_Mutation','Nonstop_Mutation','Splice_Region','Splice_Site','Translation_Start_Site'),
'missense' = c('Missense_Mutation')
)
custom_maf_schema = list(
'name' = 'custom_maf',
'column' = list(
'gene' = 'Hugo_Symbol'
, 'gene.name' = 'Hugo_Symbol'
, 'sample' = 'sample'
, 'sample.name' = 'sample'
, 'mutation.type' = 'Variant_Classification'
, 'mutation' = 'HGVSp_Short'
),
'mutation.type' = mutation_type
)
- Number of samples in the maf file
mut_samples = unique(input_maf[, custom_maf_schema$column$sample])
print(paste('##### Number of samples ####',length(mut_samples),sep="->"))
#> [1] "##### Number of samples ####->502"
- Filter the maf file to include oncokb cancer genes
maf_genes = filter_maf_gene.name(input_maf, genes = genes_to_consider, gene.col = custom_maf_schema$column$gene)
print(paste('##### Number of lines ####',nrow(maf_genes),sep="->"))
#> [1] "##### Number of lines ####->6708"
Genrating the GAMs
- Let generate the truncating data
- We generate the GAM consider the genes to consider truncating mutations.
- We also create a TMB dataframe which takes all truncating mutation of all genes in consideration.
# Creating Truncating GAM
tic('##### Creating Truncating GAM ####')
maf_trunc = filter_maf_truncating(maf_genes,genes=oncokb_truncating_genes, custom_maf_schema)
input_maf_trunc<-filter_maf_truncating(input_maf, custom_maf_schema)
truncating_tmb <- data.frame('sample'=mut_samples,'mutation'=rep(0,length(mut_samples)))
rownames(truncating_tmb)<-mut_samples
temp <- input_maf_trunc %>% count(sample)
rownames(temp)<-temp$sample
truncating_tmb[intersect(truncating_tmb$sample,temp$sample),]$mutation <-temp[intersect(truncating_tmb$sample,temp$sample),'n']
tcga_truc_gam = maf2gam(maf_trunc,
sample.col = custom_maf_schema$column$sample,
gene.col = custom_maf_schema$column$gene,
value.var = 'Variant_Classification',
samples = mut_samples,
genes = genes_to_consider,
fun.aggregate = length,
binarize=TRUE,
fill=0)
truncating_data <- list('gam'=tcga_truc_gam,
'tmb'=truncating_tmb)
toc()
#> ##### Creating Truncating GAM ####: 0.219 sec elapsed
- Let generate the Missense data
- We generate the GAM with genes and hotspot mutations from oncokb v3.9.
- We also create a TMB dataframe which takes all Missense mutation of all genes in consideration.
# Creating Missense GAM
tic('##### Creating Missense GAM ####')
maf_valid = filter_maf_schema(input_maf,
schema = custom_maf_schema,
column = 'mutation.type',
values = custom_maf_schema[['mutation.type']][['ignore']],
inclusive = FALSE)
missense_maf<-filter_maf_mutation.type(input_maf,
variants = 'Missense_Mutation',
variant.col = custom_maf_schema$column$mutation.type)
missense_tmb <- data.frame('sample'=mut_samples,'mutation'=rep(0,length(mut_samples)))
rownames(missense_tmb)<-mut_samples
temp <- missense_maf %>% count(sample)
rownames(temp)<-temp$sample
missense_tmb[intersect(missense_tmb$sample,temp$sample),]$mutation <-temp[intersect(missense_tmb$sample,temp$sample),'n']
t_m = substr(maf_valid[[custom_maf_schema$column$mutation]],3,1000)
t_m1 = gsub('[A-Z]*$', '', t_m)
maf_valid$HGVSp_Short_fixed = t_m1
maf_hotspot = filter_maf_mutations(maf_valid,
variant_catalogue,
maf.col = c(custom_maf_schema$column$gene, 'HGVSp_Short_fixed'),
values.col = c('gene', 'mut'))
missense_tcga_gam = maf2gam(maf_hotspot,
sample.col = custom_maf_schema$column$sample,
gene.col = custom_maf_schema$column$gene,
value.var = 'Variant_Classification',
samples = mut_samples,
genes = genes_to_consider,
fun.aggregate = length,
binarize=TRUE,
fill=0)
missesne_data <- list('gam'=missense_tcga_gam,
'tmb'=missense_tmb)
toc()
#> ##### Creating Missense GAM ####: 1.716 sec elapsed
Genrating the run_object to run SelectX
- We create a run_object data which is list object which consists of
- M: a list object of GAMs which is presence absence matrix of alterations
- tmb: a list object of tumor mutation burden as data frame with column names (should be) as sample and mutationn
- sample.class a named vector of sample annotations
- alteration.class a named vector of alteration annotations
gene_to_take <- colnames(missesne_data$gam)
order <- rownames(missesne_data$gam)
data <-list('M'=list('missense'=t(missesne_data$gam[order,gene_to_take]),
'truncating'=t(truncating_data$gam[rownames(missesne_data$gam[order,]),gene_to_take])),
'tmb'=list('missense'=missesne_data$tmb[order,],
'truncating'=truncating_data$tmb[order,]))
alteration_covariates <- rep('MUT',ncol(missesne_data$gam[order,gene_to_take]))
names(alteration_covariates)<-colnames(missesne_data$gam[order,gene_to_take])
sample_covariates<-rep('LUAD',length(order))
names(sample_covariates)<-order
run_data <- list('M'=data,'sample.class' = sample_covariates,'alteration.class' = alteration_covariates)
str(run_data)
#> List of 3
#> $ M :List of 2
#> ..$ M :List of 2
#> .. ..$ missense : num [1:396, 1:502] 0 0 0 0 0 0 0 0 0 0 ...
#> .. .. ..- attr(*, "dimnames")=List of 2
#> .. .. .. ..$ : chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...
#> .. .. .. ..$ : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#> .. ..$ truncating: num [1:396, 1:502] 0 0 0 0 0 0 0 0 0 0 ...
#> .. .. ..- attr(*, "dimnames")=List of 2
#> .. .. .. ..$ : chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...
#> .. .. .. ..$ : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#> ..$ tmb:List of 2
#> .. ..$ missense :'data.frame': 502 obs. of 2 variables:
#> .. .. ..$ sample : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#> .. .. ..$ mutation: num [1:502] 163 253 270 1328 100 ...
#> .. ..$ truncating:'data.frame': 502 obs. of 2 variables:
#> .. .. ..$ sample : chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#> .. .. ..$ mutation: num [1:502] 24 45 40 206 17 18 73 31 176 108 ...
#> $ sample.class : Named chr [1:502] "LUAD" "LUAD" "LUAD" "LUAD" ...
#> ..- attr(*, "names")= chr [1:502] "TCGA-05-4244-01" "TCGA-05-4249-01" "TCGA-05-4250-01" "TCGA-05-4382-01" ...
#> $ alteration.class: Named chr [1:396] "MUT" "MUT" "MUT" "MUT" ...
#> ..- attr(*, "names")= chr [1:396] "AKT1" "ATM" "BRAF" "CDKN2A" ...
- Save the
run_data
and check the introduction vignette to see how to run selectX to discover EDs.
SessionInfo
# Print the sessionInfo
sessionInfo()
#> R version 4.3.2 (2023-10-31)
#> Platform: x86_64-conda-linux-gnu (64-bit)
#> Running under: CentOS Linux 7 (Core)
#>
#> Matrix products: default
#> BLAS/LAPACK: /mnt/ndata/arvind/envs/R_4/lib/libopenblasp-r0.3.25.so; LAPACK version 3.11.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Europe/Zurich
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] tictoc_1.2 dplyr_1.1.4 SelectSim_0.0.1.3
#>
#> loaded via a namespace (and not attached):
#> [1] gtable_0.3.4 xfun_0.41 bslib_0.6.1 ggplot2_3.4.4
#> [5] rstatix_0.7.2 lattice_0.22-5 vctrs_0.6.5 tools_4.3.2
#> [9] generics_0.1.3 parallel_4.3.2 tibble_3.2.1 fansi_1.0.6
#> [13] pkgconfig_2.0.3 Matrix_1.6-5 desc_1.4.3 ggridges_0.5.5
#> [17] rngtools_1.5.2 RcppParallel_5.1.6 lifecycle_1.0.4 compiler_4.3.2
#> [21] stringr_1.5.1 textshaping_0.3.7 munsell_0.5.0 codetools_0.2-19
#> [25] carData_3.0-5 htmltools_0.5.7 sass_0.4.8 yaml_2.3.8
#> [29] pillar_1.9.0 pkgdown_2.0.7 car_3.1-2 ggpubr_0.6.0
#> [33] jquerylib_0.1.4 tidyr_1.3.0 cachem_1.0.8 doRNG_1.8.6
#> [37] iterators_1.0.14 abind_1.4-5 foreach_1.5.2 tidyselect_1.2.0
#> [41] digest_0.6.34 stringi_1.8.3 reshape2_1.4.4 purrr_1.0.2
#> [45] fastmap_1.1.1 grid_4.3.2 colorspace_2.1-0 cli_3.6.2
#> [49] magrittr_2.0.3 Rfast_2.1.0 utf8_1.2.4 broom_1.0.5
#> [53] scales_1.3.0 backports_1.4.1 RcppZiggurat_0.1.6 rmarkdown_2.25
#> [57] ggsignif_0.6.4 ragg_1.2.7 memoise_2.0.1 evaluate_0.23
#> [61] knitr_1.45 doParallel_1.0.17 rlang_1.1.3 Rcpp_1.0.12
#> [65] glue_1.7.0 jsonlite_1.8.8 R6_2.5.1 plyr_1.8.9
#> [69] systemfonts_1.0.5 fs_1.6.3