Getting started with TCGAretriever

TCGAretriever is an R library aimed at downloading genomic data from cBioPortal (https://www.cbioportal.org/), including The Cancer Genome Atlas (TCGA) data (free-tier data). TCGA is a program aimed at improving our understanding of Cancer Biology. Several TCGA Datasets are available online. TCGAretriever helps accessing and downloading TCGA data in R via the cBioPortal API. Features of TCGAretriever are:

it is simple and reliable
it is tailored for downloading large volumes of data

Below, you can find a brief tutorial that descibes a simple use case to help ou getting started with TCGAretriever.

Cancer Study IDs

library(TCGAretriever)

# Obtain a list of cancer studies from cBio
all_studies <- get_cancer_studies()

# Find published TCGA datasets
keep <- grepl("tcga_pub$", all_studies[,1])
tcga_studies <- all_studies[keep, ]

# Show results
head(tcga_studies[, 1:2])

##      cancer_study_id                                             name
## 6      laml_tcga_pub         Acute Myeloid Leukemia (TCGA, NEJM 2013)
## 20     sarc_tcga_pub     Adult Soft Tissue Sarcomas (TCGA, Cell 2017)
## 30     blca_tcga_pub Bladder Urothelial Carcinoma (TCGA, Nature 2014)
## 45     brca_tcga_pub    Breast Invasive Carcinoma (TCGA, Nature 2012)
## 66 coadread_tcga_pub    Colorectal Adenocarcinoma (TCGA, Nature 2012)
## 83     stes_tcga_pub         Esophageal Carcinoma (TCGA, Nature 2017)

# Define the cancer study id: brca_tcga_pub
my_csid <- "brca_tcga_pub"

Genetic Profiles (Assays) and Case Lists

# Obtain genetic profiles
blca_pro <- get_genetic_profiles(csid = my_csid)
head(blca_pro[, 1:2], n = 8)

##                  genetic_profile_id
## 1                brca_tcga_pub_rppa
## 2        brca_tcga_pub_rppa_Zscores
## 3              brca_tcga_pub_gistic
## 4                brca_tcga_pub_mrna
## 5 brca_tcga_pub_mrna_median_Zscores
## 6          brca_tcga_pub_linear_CNA
## 7    brca_tcga_pub_methylation_hm27
## 8           brca_tcga_pub_mutations
##                           genetic_profile_name
## 1                    Protein expression (RPPA)
## 2           Protein expression Z-scores (RPPA)
## 3 Putative copy-number alterations from GISTIC
## 4                 mRNA expression (microarray)
## 5        mRNA Expression Z-scores (microarray)
## 6           Relative linear copy-number values
## 7                           Methylation (HM27)
## 8                                    Mutations

# Obtain cases 
blca_cas <- get_case_lists(csid = my_csid)
head(blca_cas[, 1:2])

##             case_list_id      case_list_name
## 1      brca_tcga_pub_all         All samples
## 2 brca_tcga_pub_complete    Complete samples
## 3  brca_tcga_pub_luminal       Luminal (A/B)
## 4    brca_tcga_pub_basal         PAM50 Basal
## 5  brca_tcga_pub_claudin   PAM50 Claudin low
## 6     brca_tcga_pub_her2 PAM50 Her2 enriched

Retrieve Genomic Data

# Define a set of genes of interest
q_genes <- c("TP53", "MDM2", "E2F1", "EZH2")
q_cases <- "brca_tcga_pub_complete"
rna_prf <- "brca_tcga_pub_mrna"
mut_prf <- "brca_tcga_pub_mutations"

# Download RNA
brca_RNA <- TCGAretriever::get_profile_data(case_id = q_cases, gprofile_id = rna_prf, glist = q_genes)

NOTE: the resuting data.frame includes ENTREZ_GENE_IDs and OFFICIAL_SMBOLs as first and second column.

head(brca_RNA[, 1:5])

##   GENE_ID COMMON TCGA-A2-A0T2-01 TCGA-A2-A04P-01 TCGA-A1-A0SK-01
## 1    1869   E2F1        -1.69975        -0.45025         0.23825
## 2    2146   EZH2    -1.672866667    -0.894266667     0.528933333
## 3    4193   MDM2    -0.355583333        -0.24975    -0.656166667
## 4    7157   TP53    -0.388833333         -1.8085     1.671666667

# Set SYMBOLs as rownames
# Note that you may prefer to use the tibble package for this
rownames(brca_RNA) <- brca_RNA$COMMON
brca_RNA <- brca_RNA[, -c(1,2)]

# Download mutations (simple)
brca_MUT <- TCGAretriever::get_profile_data(case_id = q_cases, gprofile_id = mut_prf, glist = q_genes)
rownames(brca_MUT) <- brca_MUT$COMMON
brca_MUT <- brca_MUT[, -c(1,2)]

# Show results
brca_RNA[,1:6]

##      TCGA-A2-A0T2-01 TCGA-A2-A04P-01 TCGA-A1-A0SK-01 TCGA-A2-A0CM-01
## E2F1        -1.69975        -0.45025         0.23825        -0.56825
## EZH2    -1.672866667    -0.894266667     0.528933333    -0.426866667
## MDM2    -0.355583333        -0.24975    -0.656166667    -0.064916667
## TP53    -0.388833333         -1.8085     1.671666667     0.490333333
##      TCGA-AR-A1AR-01 TCGA-B6-A0WX-01
## E2F1          -1.255           -1.49
## EZH2    -1.118133333         -1.6076
## MDM2    -1.032583333     0.662666667
## TP53         -0.8395           0.851

brca_MUT[,1:6]

##      TCGA-A2-A0T2-01 TCGA-A2-A04P-01 TCGA-A1-A0SK-01 TCGA-A2-A0CM-01
## E2F1             NaN             NaN             NaN             NaN
## EZH2             NaN             NaN             NaN             NaN
## MDM2             NaN             NaN             NaN             NaN
## TP53           Y220C      G108Vfs*15           M133K      E204Sfs*43
##      TCGA-AR-A1AR-01 TCGA-B6-A0WX-01
## E2F1             NaN             NaN
## EZH2             NaN             NaN
## MDM2             NaN             NaN
## TP53             NaN     R248W,L114*

NOTE: when using the same case_list_id to retrieve different types of data (genetic profiles) results have consistent structure. In other words, data.frames include info for the same list of cases (and hence, the resulting data.frames have the same number of columns, and identical column names).

# Note that the columns (cases) are identical 
# and have the same order in both data.frames
sum(colnames(brca_MUT) != colnames(brca_RNA))

## [1] 0

Examples and visualizations

Relationship between E2F1 and EZH2 in BRCA

# Coerce to data.frame with numeric features 
df <- data.frame(t(brca_RNA), stringsAsFactors = FALSE)
for(i in 1:ncol(df)) { df[, i] <- as.numeric(df[, i])}

# Visualize the correlation between EZH2 and E2F1
with(df, 
     plot(E2F1, EZH2, 
          pch = 19, cex = 0.5, main = "E2F1-EZH2 correlation in BRCA"))

Relationship between MDM2 and P53, by P53 mutation status in BRCA

# Coerce to data.frame with numeric features 
df <- data.frame(t(brca_RNA), stringsAsFactors = FALSE)
for(i in 1:ncol(df)) { df[, i] <- as.numeric(df[, i])}
df$TP53.status <- as.factor(ifelse(brca_MUT["TP53",] == "NaN", "WT", "MUT"))

# Split data based on TP53.status 
lst <- split(df, f = df$TP53.status)

# Visualize the correlation between MDM2 and TP53 by P53 mutation status
par(mfrow = c(1, 2))
for(x in names(lst)) {
  with(lst[[x]], 
       plot(TP53, MDM2, 
            pch = 19, cex = 0.5, 
            xlim = c(-2.6, 2.2), ylim = c(-1.6, 3.6),
            main = paste0("MDM2-vs-P53 in ", x , " P53 tumors")))
}

Getting started with TCGAretriever

Damiano Fantini

December 17, 2019

Cancer Study IDs

Genetic Profiles (Assays) and Case Lists

Retrieve Genomic Data

Examples and visualizations