TCGAretriever is an R library aimed at downloading genomic data from cBioPortal (https://www.cbioportal.org/), including The Cancer Genome Atlas (TCGA) data (free-tier data). TCGA is a program aimed at improving our understanding of Cancer Biology. Several TCGA Datasets are available online. TCGAretriever helps accessing and downloading TCGA data in R via the cBioPortal API. Features of TCGAretriever are:
it is simple and reliable
it is tailored for downloading large volumes of data
Below, you can find a brief tutorial that descibes a simple use case to help ou getting started with TCGAretriever.
library(TCGAretriever)
# Obtain a list of cancer studies from cBio
all_studies <- get_cancer_studies()
# Find published TCGA datasets
keep <- grepl("tcga_pub$", all_studies[,1])
tcga_studies <- all_studies[keep, ]
# Show results
head(tcga_studies[, 1:2])
## cancer_study_id name
## 6 laml_tcga_pub Acute Myeloid Leukemia (TCGA, NEJM 2013)
## 20 sarc_tcga_pub Adult Soft Tissue Sarcomas (TCGA, Cell 2017)
## 30 blca_tcga_pub Bladder Urothelial Carcinoma (TCGA, Nature 2014)
## 45 brca_tcga_pub Breast Invasive Carcinoma (TCGA, Nature 2012)
## 66 coadread_tcga_pub Colorectal Adenocarcinoma (TCGA, Nature 2012)
## 83 stes_tcga_pub Esophageal Carcinoma (TCGA, Nature 2017)
# Define the cancer study id: brca_tcga_pub
my_csid <- "brca_tcga_pub"
# Obtain genetic profiles
blca_pro <- get_genetic_profiles(csid = my_csid)
head(blca_pro[, 1:2], n = 8)
## genetic_profile_id
## 1 brca_tcga_pub_rppa
## 2 brca_tcga_pub_rppa_Zscores
## 3 brca_tcga_pub_gistic
## 4 brca_tcga_pub_mrna
## 5 brca_tcga_pub_mrna_median_Zscores
## 6 brca_tcga_pub_linear_CNA
## 7 brca_tcga_pub_methylation_hm27
## 8 brca_tcga_pub_mutations
## genetic_profile_name
## 1 Protein expression (RPPA)
## 2 Protein expression Z-scores (RPPA)
## 3 Putative copy-number alterations from GISTIC
## 4 mRNA expression (microarray)
## 5 mRNA Expression Z-scores (microarray)
## 6 Relative linear copy-number values
## 7 Methylation (HM27)
## 8 Mutations
# Obtain cases
blca_cas <- get_case_lists(csid = my_csid)
head(blca_cas[, 1:2])
## case_list_id case_list_name
## 1 brca_tcga_pub_all All samples
## 2 brca_tcga_pub_complete Complete samples
## 3 brca_tcga_pub_luminal Luminal (A/B)
## 4 brca_tcga_pub_basal PAM50 Basal
## 5 brca_tcga_pub_claudin PAM50 Claudin low
## 6 brca_tcga_pub_her2 PAM50 Her2 enriched
# Define a set of genes of interest
q_genes <- c("TP53", "MDM2", "E2F1", "EZH2")
q_cases <- "brca_tcga_pub_complete"
rna_prf <- "brca_tcga_pub_mrna"
mut_prf <- "brca_tcga_pub_mutations"
# Download RNA
brca_RNA <- TCGAretriever::get_profile_data(case_id = q_cases, gprofile_id = rna_prf, glist = q_genes)
NOTE: the resuting data.frame includes ENTREZ_GENE_IDs and OFFICIAL_SMBOLs as first and second column.
head(brca_RNA[, 1:5])
## GENE_ID COMMON TCGA-A2-A0T2-01 TCGA-A2-A04P-01 TCGA-A1-A0SK-01
## 1 1869 E2F1 -1.69975 -0.45025 0.23825
## 2 2146 EZH2 -1.672866667 -0.894266667 0.528933333
## 3 4193 MDM2 -0.355583333 -0.24975 -0.656166667
## 4 7157 TP53 -0.388833333 -1.8085 1.671666667
# Set SYMBOLs as rownames
# Note that you may prefer to use the tibble package for this
rownames(brca_RNA) <- brca_RNA$COMMON
brca_RNA <- brca_RNA[, -c(1,2)]
# Download mutations (simple)
brca_MUT <- TCGAretriever::get_profile_data(case_id = q_cases, gprofile_id = mut_prf, glist = q_genes)
rownames(brca_MUT) <- brca_MUT$COMMON
brca_MUT <- brca_MUT[, -c(1,2)]
# Show results
brca_RNA[,1:6]
## TCGA-A2-A0T2-01 TCGA-A2-A04P-01 TCGA-A1-A0SK-01 TCGA-A2-A0CM-01
## E2F1 -1.69975 -0.45025 0.23825 -0.56825
## EZH2 -1.672866667 -0.894266667 0.528933333 -0.426866667
## MDM2 -0.355583333 -0.24975 -0.656166667 -0.064916667
## TP53 -0.388833333 -1.8085 1.671666667 0.490333333
## TCGA-AR-A1AR-01 TCGA-B6-A0WX-01
## E2F1 -1.255 -1.49
## EZH2 -1.118133333 -1.6076
## MDM2 -1.032583333 0.662666667
## TP53 -0.8395 0.851
brca_MUT[,1:6]
## TCGA-A2-A0T2-01 TCGA-A2-A04P-01 TCGA-A1-A0SK-01 TCGA-A2-A0CM-01
## E2F1 NaN NaN NaN NaN
## EZH2 NaN NaN NaN NaN
## MDM2 NaN NaN NaN NaN
## TP53 Y220C G108Vfs*15 M133K E204Sfs*43
## TCGA-AR-A1AR-01 TCGA-B6-A0WX-01
## E2F1 NaN NaN
## EZH2 NaN NaN
## MDM2 NaN NaN
## TP53 NaN R248W,L114*
NOTE: when using the same case_list_id
to retrieve different types of data (genetic profiles) results have consistent structure. In other words, data.frames include info for the same list of cases (and hence, the resulting data.frames have the same number of columns, and identical column names).
# Note that the columns (cases) are identical
# and have the same order in both data.frames
sum(colnames(brca_MUT) != colnames(brca_RNA))
## [1] 0
Relationship between E2F1 and EZH2 in BRCA
# Coerce to data.frame with numeric features
df <- data.frame(t(brca_RNA), stringsAsFactors = FALSE)
for(i in 1:ncol(df)) { df[, i] <- as.numeric(df[, i])}
# Visualize the correlation between EZH2 and E2F1
with(df,
plot(E2F1, EZH2,
pch = 19, cex = 0.5, main = "E2F1-EZH2 correlation in BRCA"))
Relationship between MDM2 and P53, by P53 mutation status in BRCA
# Coerce to data.frame with numeric features
df <- data.frame(t(brca_RNA), stringsAsFactors = FALSE)
for(i in 1:ncol(df)) { df[, i] <- as.numeric(df[, i])}
df$TP53.status <- as.factor(ifelse(brca_MUT["TP53",] == "NaN", "WT", "MUT"))
# Split data based on TP53.status
lst <- split(df, f = df$TP53.status)
# Visualize the correlation between MDM2 and TP53 by P53 mutation status
par(mfrow = c(1, 2))
for(x in names(lst)) {
with(lst[[x]],
plot(TP53, MDM2,
pch = 19, cex = 0.5,
xlim = c(-2.6, 2.2), ylim = c(-1.6, 3.6),
main = paste0("MDM2-vs-P53 in ", x , " P53 tumors")))
}