We have implemented a highly optimized version of the iq pipeline (Pham et al., Bioinformatics 2020). To run the following examples, download DIA-report-long-format.zip and unzip the file to a local working directory.
The unzipped file DIA-report-long-format.txt
is a tab-deliminated text file exported from a Spectronaut search using this export schema iq.rs. The user might want to import this schema to their Spectronaut installation for the ease of using the iq pipeline.
First, we apply the standard pipeline implemented in pure R. Read and filter the data
library("iq") # if not already installed, run install.packages("iq")
raw <- read.delim("DIA-Report-long-format.txt")
selected <- raw$F.ExcludedFromQuantification == "False" &
!is.na(raw$PG.Qvalue) & (raw$PG.Qvalue < 0.01) &
!is.na(raw$EG.Qvalue) & (raw$EG.Qvalue < 0.01)
raw <- raw[selected,]
Normalize the data, create a protein list, and perform the MaxLFQ algorithm
sample_id <- "R.FileName"
secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")
norm_data <- iq::preprocess(raw,
sample_id = sample_id,
secondary_id = secondary_id)
#> Concatenating secondary ids...
#> Removing low intensities...
#> Barplotting raw data ...
#> Median normalization ...
#> Barplotting after normalization ...
protein_list <- iq::create_protein_list(norm_data)
#> # proteins = 3554, # samples = 24
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
result <- iq::create_protein_table(protein_list)
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
Extract annotation columns and write the result to an output file
annotation_columns <- c("PG.Genes", "PG.ProteinNames")
extra_names <- iq::extract_annotation(rownames(result$estimate),
raw,
annotation_columns = annotation_columns)
write.table(cbind(Protein = rownames(result$estimate),
extra_names[, annotation_columns],
MaxLFQ_annotation = result$annotation,
result$estimate),
"iq-MaxLFQ.txt", sep = "\t", row.names = FALSE)
The resulting file iq-MaxLFQ.txt
is the protein level quantification report in a tab-deliminated text format.
The function iq::fast_MaxLFQ
implemented in C++ combines the functionalities of iq::create_protein_list
and iq::create_protein_table
.
#--------------------- Replacing ---------------------
# protein_list <- iq::create_protein_list(norm_data) #
# result <- iq::create_protein_table(protein_list) #
#-----------------------------------------------------
result_faster <- iq::fast_MaxLFQ(norm_data)
#> nrow = 3369557, # proteins = 3554, # samples = 24
#> 11 thread(s) available...
#> 0%
#> 5%
#> 10%
#> 15%
#> 21%
#> 26%
#> 32%
#> 38%
#> 43%
#> 48%
#> 54%
#> 59%
#> 64%
#> 69%
#> 74%
#> 80%
#> 85%
#> 90%
#> 95%
#> Completed.
The results of the R implementation result
and C++ implementation result_faster
should be equal up to the floating-point precision of the underlying numerical libraries.
cat("Max difference =", max(abs(result_faster$estimate - result$estimate), na.rm = TRUE), "\n")
#> Max difference = 1.207923e-13
cat("Identical NAs =", identical(is.na(result_faster$estimate), is.na(result$estimate)), "\n")
#> Identical NAs = TRUE
cat("Equal annotation =", identical(result_faster$annotation, result$annotation), "\n")
#> Equal annotation = TRUE
We can check the improvement in execution time. The following result is obtained on a computer with 12 CPU cores.
system.time({
protein_list <- iq::create_protein_list(norm_data)
result <- iq::create_protein_table(protein_list)
})
#> # proteins = 3554, # samples = 24
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
#> user system elapsed
#> 581.72 17.54 599.63
system.time({
result_faster <- iq::fast_MaxLFQ(norm_data)
})
#> nrow = 3369557, # proteins = 3554, # samples = 24
#> 11 thread(s) available...
#> 0%
#> 6%
#> 11%
#> 16%
#> 21%
#> 26%
#> 31%
#> 37%
#> 44%
#> 49%
#> 54%
#> 59%
#> 64%
#> 69%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
#> Completed.
#> user system elapsed
#> 3.65 0.03 1.50
We have implemented a fast data loading algorithm and an efficient data structure. The memory usage is highly optimized to enable the processing of very large datasets.
sample_id <- "R.FileName"
secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")
annotation_columns <- c("PG.Genes", "PG.ProteinNames")
iq_dat <- iq::fast_read("DIA-report-long-format.txt",
sample_id = sample_id,
secondary_id = secondary_id,
filter_string_equal = c("F.ExcludedFromQuantification" = "False"),
annotation_col = annotation_columns)
#>
#> Command: --sample R.FileName --primary PG.ProteinGroups --secondary EG.Library FG.Id FG.Charge F.FrgIon F.Charge F.FrgLossType --quant F.PeakArea --annotation PG.Genes PG.ProteinNames --filter-string-equal F.ExcludedFromQuantification False --filter-double-less PG.Qvalue 0.01 --filter-double-less EG.Qvalue 0.01 DIA-report-long-format.txt
#>
#> Sample column:
#> R.FileName
#> Protein column:
#> PG.ProteinGroups
#> Ion column(s):
#> EG.Library FG.Id FG.Charge F.FrgIon F.Charge F.FrgLossType
#> Quant column:
#> F.PeakArea
#> Annotation column(s):
#> PG.Genes PG.ProteinNames
#> String equal filter(s):
#> F.ExcludedFromQuantification == False
#> Double less filter(s):
#> PG.Qvalue < 0.010000
#> EG.Qvalue < 0.010000
#>
#> Using 4 threads ...
#> 20 samples read
#>
#> # lines read (excluding headers) = 5547331
#> # quantitative values after filtering = 3390569
#>
#> # samples = 24
#> # proteins = 3554
iq_norm_data <- iq::fast_preprocess(iq_dat$quant_table)
#> Removing low intensities...
#> Barplotting raw data ...
#> Median normalization ...
#> Barplotting after normalization ...
result_fastest <- iq::fast_MaxLFQ(iq_norm_data,
row_names = iq_dat$protein[, 1],
col_names = iq_dat$sample)
#> nrow = 3369557, # proteins = 3554, # samples = 24
#> 11 thread(s) available...
#> 0%
#> 5%
#> 11%
#> 17%
#> 23%
#> 28%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 76%
#> 82%
#> 87%
#> 93%
#> 98%
#> Completed.
The result of the optimized pipeline result_fastest
should be the same as that of the standard pipeline result
.
cat("Max difference =", max(abs(result_fastest$estimate - result$estimate), na.rm = TRUE), "\n")
#> Max difference = 1.207923e-13
cat("Identical NAs =", identical(is.na(result_fastest$estimate), is.na(result$estimate)), "\n")
#> Identical NAs = TRUE
cat("Equal annotation =", identical(result_fastest$annotation, result$annotation), "\n")
#> Equal annotation = TRUE
The annotation columns are stored in the protein
component of the input data structure iq_dat
. We can extract the annotation columns and write the result to an output text file.
iq_extra_names <- iq::extract_annotation(rownames(result_fastest$estimate),
iq_dat$protein,
annotation_columns = annotation_columns)
write.table(cbind(Protein = rownames(result_fastest$estimate),
iq_extra_names[, annotation_columns],
MaxLFQ_annotation = result_fastest$annotation,
result_fastest$estimate),
"iq-MaxLFQ-fast.txt", sep = "\t", row.names = FALSE)
sample_id <- "R.FileName"
secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")
annotation_columns <- c("PG.Genes", "PG.ProteinNames")
system.time({
# reading data
raw <- read.delim("DIA-report-long-format.txt")
# filtering
selected <- raw$F.ExcludedFromQuantification == "False" &
!is.na(raw$PG.Qvalue) & raw$PG.Qvalue < 0.01 &
!is.na(raw$EG.Qvalue) & raw$EG.Qvalue < 0.01
raw <- raw[selected,]
## process
norm_data <- iq::preprocess(raw,
sample_id = sample_id,
secondary_id = secondary_id)
protein_list <- iq::create_protein_list(norm_data)
result <- iq::create_protein_table(protein_list)
})
#> Concatenating secondary ids...
#> Removing low intensities...
#> Barplotting raw data ...
#> Median normalization ...
#> Barplotting after normalization ...
#> # proteins = 3554, # samples = 24
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
#> user system elapsed
#> 814.57 27.73 848.11
system.time({
iq_dat <- iq::fast_read("DIA-report-long-format.txt",
sample_id = sample_id,
secondary_id = secondary_id,
filter_string_equal = c("F.ExcludedFromQuantification" = "False"),
annotation_col = annotation_columns)
iq_norm_data <- iq::fast_preprocess(iq_dat$quant_table)
result_fastest <- iq::fast_MaxLFQ(iq_norm_data,
row_names = iq_dat$protein[, 1],
col_names = iq_dat$sample)
})
#>
#> Command: --sample R.FileName --primary PG.ProteinGroups --secondary EG.Library FG.Id FG.Charge F.FrgIon F.Charge F.FrgLossType --quant F.PeakArea --annotation PG.Genes PG.ProteinNames --filter-string-equal F.ExcludedFromQuantification False --filter-double-less PG.Qvalue 0.01 --filter-double-less EG.Qvalue 0.01 DIA-report-long-format.txt
#>
#> Sample column:
#> R.FileName
#> Protein column:
#> PG.ProteinGroups
#> Ion column(s):
#> EG.Library FG.Id FG.Charge F.FrgIon F.Charge F.FrgLossType
#> Quant column:
#> F.PeakArea
#> Annotation column(s):
#> PG.Genes PG.ProteinNames
#> String equal filter(s):
#> F.ExcludedFromQuantification == False
#> Double less filter(s):
#> PG.Qvalue < 0.010000
#> EG.Qvalue < 0.010000
#>
#> Using 4 threads ...
#> 20 samples read
#>
#> # lines read (excluding headers) = 5547331
#> # quantitative values after filtering = 3390569
#>
#> # samples = 24
#> # proteins = 3554
#> Removing low intensities...
#> Barplotting raw data ...
#> Median normalization ...
#> Barplotting after normalization ...
#> nrow = 3369557, # proteins = 3554, # samples = 24
#> 11 thread(s) available...
#> 0%
#> 5%
#> 10%
#> 15%
#> 20%
#> 26%
#> 31%
#> 36%
#> 41%
#> 47%
#> 54%
#> 59%
#> 64%
#> 69%
#> 74%
#> 79%
#> 84%
#> 89%
#> 95%
#> Completed.
#> user system elapsed
#> 33.78 1.80 18.38