Fast MaxLFQ

The standard pipeline

First, we apply the standard pipeline implemented in pure R. Read and filter the data

library("iq") # if not already installed, run install.packages("iq") 

raw <- read.delim("DIA-Report-long-format.txt")

selected <- raw$F.ExcludedFromQuantification == "False" & 
            !is.na(raw$PG.Qvalue) & (raw$PG.Qvalue < 0.01) &
            !is.na(raw$EG.Qvalue) & (raw$EG.Qvalue < 0.01)

raw <- raw[selected,]

Normalize the data, create a protein list, and perform the MaxLFQ algorithm

sample_id  <- "R.FileName" 

secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")

norm_data <- iq::preprocess(raw, 
                            sample_id  = sample_id, 
                            secondary_id = secondary_id)
#> Concatenating secondary ids...
#> Removing low intensities...
#> Barplotting raw data ...
#> Median normalization ...
#> Barplotting after normalization ...

protein_list <- iq::create_protein_list(norm_data)
#> # proteins = 3554, # samples = 24
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%

result <- iq::create_protein_table(protein_list)
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%

Extract annotation columns and write the result to an output file

annotation_columns <- c("PG.Genes", "PG.ProteinNames")

extra_names <- iq::extract_annotation(rownames(result$estimate), 
                                      raw, 
                                      annotation_columns = annotation_columns)

write.table(cbind(Protein = rownames(result$estimate),
                  extra_names[, annotation_columns],
                  MaxLFQ_annotation = result$annotation,
                  result$estimate),
            "iq-MaxLFQ.txt", sep = "\t", row.names = FALSE)

The resulting file iq-MaxLFQ.txt is the protein level quantification report in a tab-deliminated text format.

A faster MaxLFQ implementation

The function iq::fast_MaxLFQ implemented in C++ combines the functionalities of iq::create_protein_list and iq::create_protein_table.

#--------------------- Replacing ---------------------
# protein_list <- iq::create_protein_list(norm_data) #
# result <- iq::create_protein_table(protein_list)   #
#-----------------------------------------------------

result_faster <- iq::fast_MaxLFQ(norm_data)
#> nrow = 3369557, # proteins = 3554, # samples = 24
#> 11 thread(s) available...
#> 0%
#> 5%
#> 10%
#> 15%
#> 21%
#> 26%
#> 32%
#> 38%
#> 43%
#> 48%
#> 54%
#> 59%
#> 64%
#> 69%
#> 74%
#> 80%
#> 85%
#> 90%
#> 95%
#> Completed.

The results of the R implementation result and C++ implementation result_faster should be equal up to the floating-point precision of the underlying numerical libraries.

cat("Max difference =", max(abs(result_faster$estimate - result$estimate), na.rm = TRUE), "\n")
#> Max difference = 1.207923e-13

cat("Identical NAs =", identical(is.na(result_faster$estimate), is.na(result$estimate)), "\n")
#> Identical NAs = TRUE

cat("Equal annotation =", identical(result_faster$annotation, result$annotation), "\n")
#> Equal annotation = TRUE

Benchmarking execution time

We can check the improvement in execution time. The following result is obtained on a computer with 12 CPU cores.

system.time({
    protein_list <- iq::create_protein_list(norm_data)
    result <- iq::create_protein_table(protein_list)
})
#> # proteins = 3554, # samples = 24
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
#>    user  system elapsed 
#>  581.72   17.54  599.63

system.time({
    result_faster <- iq::fast_MaxLFQ(norm_data)
})
#> nrow = 3369557, # proteins = 3554, # samples = 24
#> 11 thread(s) available...
#> 0%
#> 6%
#> 11%
#> 16%
#> 21%
#> 26%
#> 31%
#> 37%
#> 44%
#> 49%
#> 54%
#> 59%
#> 64%
#> 69%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
#> Completed.
#>    user  system elapsed 
#>    3.65    0.03    1.50

An efficient data structure and data loading

We have implemented a fast data loading algorithm and an efficient data structure. The memory usage is highly optimized to enable the processing of very large datasets.

sample_id  <- "R.FileName" 

secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")

annotation_columns <- c("PG.Genes", "PG.ProteinNames")

iq_dat <- iq::fast_read("DIA-report-long-format.txt",
                        sample_id  = sample_id, 
                        secondary_id = secondary_id,
                        filter_string_equal = c("F.ExcludedFromQuantification" = "False"),
                        annotation_col = annotation_columns)
#> 
#> Command: --sample R.FileName --primary PG.ProteinGroups --secondary EG.Library FG.Id FG.Charge F.FrgIon F.Charge F.FrgLossType --quant F.PeakArea --annotation PG.Genes PG.ProteinNames --filter-string-equal F.ExcludedFromQuantification False --filter-double-less PG.Qvalue 0.01 --filter-double-less EG.Qvalue 0.01 DIA-report-long-format.txt 
#> 
#> Sample column:
#>     R.FileName
#> Protein column:
#>     PG.ProteinGroups
#> Ion column(s):
#>     EG.Library FG.Id FG.Charge F.FrgIon F.Charge F.FrgLossType
#> Quant column:
#>     F.PeakArea
#> Annotation column(s):
#>     PG.Genes PG.ProteinNames
#> String equal filter(s):
#>     F.ExcludedFromQuantification == False
#> Double less filter(s):
#>     PG.Qvalue < 0.010000
#>     EG.Qvalue < 0.010000
#> 
#> Using 4 threads ...
#> 20 samples read
#> 
#> # lines read (excluding headers)      = 5547331
#> # quantitative values after filtering = 3390569
#> 
#> # samples  = 24
#> # proteins = 3554

iq_norm_data <- iq::fast_preprocess(iq_dat$quant_table)
#> Removing low intensities...
#> Barplotting raw data ...
#> Median normalization ...
#> Barplotting after normalization ...

result_fastest <- iq::fast_MaxLFQ(iq_norm_data, 
                                  row_names = iq_dat$protein[, 1], 
                                  col_names = iq_dat$sample)
#> nrow = 3369557, # proteins = 3554, # samples = 24
#> 11 thread(s) available...
#> 0%
#> 5%
#> 11%
#> 17%
#> 23%
#> 28%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 76%
#> 82%
#> 87%
#> 93%
#> 98%
#> Completed.

The result of the optimized pipeline result_fastest should be the same as that of the standard pipeline result.

cat("Max difference =", max(abs(result_fastest$estimate - result$estimate), na.rm = TRUE), "\n")
#> Max difference = 1.207923e-13

cat("Identical NAs =", identical(is.na(result_fastest$estimate), is.na(result$estimate)), "\n")
#> Identical NAs = TRUE

cat("Equal annotation =", identical(result_fastest$annotation, result$annotation), "\n")
#> Equal annotation = TRUE

The annotation columns are stored in the protein component of the input data structure iq_dat. We can extract the annotation columns and write the result to an output text file.

iq_extra_names <- iq::extract_annotation(rownames(result_fastest$estimate), 
                                         iq_dat$protein, 
                                         annotation_columns = annotation_columns)

write.table(cbind(Protein = rownames(result_fastest$estimate),
                  iq_extra_names[, annotation_columns],
                  MaxLFQ_annotation = result_fastest$annotation,
                  result_fastest$estimate), 
            "iq-MaxLFQ-fast.txt", sep = "\t", row.names = FALSE)

Benchmarking execution time

sample_id  <- "R.FileName" 

secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")

annotation_columns <- c("PG.Genes", "PG.ProteinNames")

system.time({
    
    # reading data
    raw <- read.delim("DIA-report-long-format.txt")

    # filtering
    selected <- raw$F.ExcludedFromQuantification == "False" & 
                !is.na(raw$PG.Qvalue) & raw$PG.Qvalue < 0.01 &
                !is.na(raw$EG.Qvalue) & raw$EG.Qvalue < 0.01

    raw <- raw[selected,]

    ## process

    norm_data <- iq::preprocess(raw, 
                                sample_id  = sample_id, 
                                secondary_id = secondary_id)

    protein_list <- iq::create_protein_list(norm_data)
    
    result <- iq::create_protein_table(protein_list)
    
})
#> Concatenating secondary ids...
#> Removing low intensities...
#> Barplotting raw data ...
#> Median normalization ...
#> Barplotting after normalization ...
#> # proteins = 3554, # samples = 24
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
#> 5%
#> 10%
#> 15%
#> 20%
#> 25%
#> 30%
#> 35%
#> 40%
#> 45%
#> 50%
#> 55%
#> 60%
#> 65%
#> 70%
#> 75%
#> 80%
#> 85%
#> 90%
#> 95%
#>    user  system elapsed 
#>  814.57   27.73  848.11

system.time({
    iq_dat <- iq::fast_read("DIA-report-long-format.txt",
                            sample_id  = sample_id, 
                            secondary_id = secondary_id,
                            filter_string_equal = c("F.ExcludedFromQuantification" = "False"),
                            annotation_col = annotation_columns)

    iq_norm_data <- iq::fast_preprocess(iq_dat$quant_table)

    result_fastest <- iq::fast_MaxLFQ(iq_norm_data, 
                                      row_names = iq_dat$protein[, 1], 
                                      col_names = iq_dat$sample)
})
#> 
#> Command: --sample R.FileName --primary PG.ProteinGroups --secondary EG.Library FG.Id FG.Charge F.FrgIon F.Charge F.FrgLossType --quant F.PeakArea --annotation PG.Genes PG.ProteinNames --filter-string-equal F.ExcludedFromQuantification False --filter-double-less PG.Qvalue 0.01 --filter-double-less EG.Qvalue 0.01 DIA-report-long-format.txt 
#> 
#> Sample column:
#>     R.FileName
#> Protein column:
#>     PG.ProteinGroups
#> Ion column(s):
#>     EG.Library FG.Id FG.Charge F.FrgIon F.Charge F.FrgLossType
#> Quant column:
#>     F.PeakArea
#> Annotation column(s):
#>     PG.Genes PG.ProteinNames
#> String equal filter(s):
#>     F.ExcludedFromQuantification == False
#> Double less filter(s):
#>     PG.Qvalue < 0.010000
#>     EG.Qvalue < 0.010000
#> 
#> Using 4 threads ...
#> 20 samples read
#> 
#> # lines read (excluding headers)      = 5547331
#> # quantitative values after filtering = 3390569
#> 
#> # samples  = 24
#> # proteins = 3554
#> Removing low intensities...
#> Barplotting raw data ...
#> Median normalization ...
#> Barplotting after normalization ...
#> nrow = 3369557, # proteins = 3554, # samples = 24
#> 11 thread(s) available...
#> 0%
#> 5%
#> 10%
#> 15%
#> 20%
#> 26%
#> 31%
#> 36%
#> 41%
#> 47%
#> 54%
#> 59%
#> 64%
#> 69%
#> 74%
#> 79%
#> 84%
#> 89%
#> 95%
#> Completed.
#>    user  system elapsed 
#>   33.78    1.80   18.38

References

Pham TV, Henneman AA, Jimenez CR (2020) iq: an R package to estimate relative protein abundances from ion quantification in DIA-MS-based proteomics. Bioinformatics 36(8):2611-2613.

Fast MaxLFQ

Thang V Pham

2022-04-26

The standard pipeline

A faster MaxLFQ implementation

Benchmarking execution time

An efficient data structure and data loading

Benchmarking execution time

References