In this vignette I show comparisons between namedCapture::df_match_variable
and its closest cousin in the R package universe, tidyr::extract.
The two packages can be used to compute the same result, but the
code/syntax is different.
In this first comparison we use a syntax with each group name on the same line as its pattern. Here are some observations from the comparison:
namedCapture::df_match_variable
Type conversion is done via the convert argument of tidyr::extract, which
uses utils::type.convert. Because type.convert does not know how to
convert strings like 111,000 to integer, we first need to use
remove.commas to create a new data.frame to use as input to
tidyr::extract. In contrast, namedCapture supports arbitrary
group-specific type conversion functions; we specify to.int on the
same line as the corresponding name/pattern for the
chromStart/chromEnd groups.
## First define data.
(sacct.df <- data.frame(
position=c(
"chr10:213,054,000-213,055,000",
"chrM:111,000-222,000",
"this will not match",
NA, # neither will this.
"chr1:110-111 chr2:220-222"), # two possible matches.
JobID=c(
"13937810_25",
"13937810_25.batch",
"13937810_25.extern",
"14022192_[1-3]",
"14022204_[4]"),
stringsAsFactors=FALSE))
#> position JobID
#> 1 chr10:213,054,000-213,055,000 13937810_25
#> 2 chrM:111,000-222,000 13937810_25.batch
#> 3 this will not match 13937810_25.extern
#> 4 <NA> 14022192_[1-3]
#> 5 chr1:110-111 chr2:220-222 14022204_[4]
## Delete every comma from each element of x, e.g. "111,000" -> "111000".
remove.commas <- function(x){
  gsub(",", "", x, fixed=TRUE)
}
long.list <- list()
## namedCapture: 29 lines of code.
range.list <- list(
"\\[",
task1="[0-9]+", as.integer,
"(?:-",#begin optional end of range.
taskN="[0-9]+", as.integer,
")?", #end is optional.
"\\]")
task.list <- list(
"(?:",#begin alternate
task="[0-9]+", as.integer,
"|",#either one task(above) or range(below)
range.list,
")")#end alternate
to.int <- function(x)as.integer(remove.commas(x))#strip commas, then convert to integer.
(long.list$namedCapture <- namedCapture::df_match_variable(
sacct.df,
JobID=list(
job="[0-9]+", as.integer,
"_",
task.list,
"(?:[.]",
type=".*",
")?"),
position=list(
chrom="chr.*?",
":",
chromStart=".*?", to.int,
"-",
chromEnd="[0-9,]*", to.int)))
#> position JobID JobID.job JobID.task
#> 1 chr10:213,054,000-213,055,000 13937810_25 13937810 25
#> 2 chrM:111,000-222,000 13937810_25.batch 13937810 25
#> 3 this will not match 13937810_25.extern 13937810 25
#> 4 <NA> 14022192_[1-3] 14022192 NA
#> 5 chr1:110-111 chr2:220-222 14022204_[4] 14022204 NA
#> JobID.task1 JobID.taskN JobID.type position.chrom position.chromStart
#> 1 NA NA chr10 213054000
#> 2 NA NA batch chrM 111000
#> 3 NA NA extern <NA> NA
#> 4 1 3 <NA> NA
#> 5 4 NA chr1 110
#> position.chromEnd
#> 1 213055000
#> 2 222000
#> 3 NA
#> 4 NA
#> 5 111
## tidyr: 46 lines of code.
range.vec <- c(
"\\[",
task1="[0-9]+",
"(?:-",#begin optional end of range.
taskN="[0-9]+",
")?", #end is optional.
"\\]")
task.vec <- c(
"(?:",#begin alternate
task="[0-9]+",
"|",#either one task(above) or range(below)
range.vec,
")")#end alternate
regex.list <- list(
JobID=c(
job="[0-9]+",
"_",
task.vec,
"(?:[.]",
type=".*",
")?"),
position=c(
chrom="chr.*?",
":",
chromStart=".*?",
"-",
chromEnd="[0-9,]*"))
tidyr.input <- transform(
sacct.df,
position=remove.commas(position))
tidyr.df.list <- list(sacct.df)
for(col.name in names(regex.list)){#one tidyr::extract call per subject column.
regex.vec <- regex.list[[col.name]]
is.group <- names(regex.vec)!=""#named elements are the capture groups.
format.vec <- ifelse(is.group, "(%s)", "%s")#only groups get parentheses.
group.vec <- sprintf(format.vec, regex.vec)
regex <- paste(group.vec, collapse="")#single pattern string for this column.
group.names <- names(regex.vec)[is.group]
result <- tidyr::extract(
tidyr.input, col.name, group.names, regex, convert=TRUE)
to.save <- result[, group.names, drop=FALSE]#keep only the extracted columns.
names(to.save) <- paste0(col.name, ".", group.names)#prefix with the column name.
tidyr.df.list[[col.name]] <- to.save
}
names(tidyr.df.list) <- NULL
long.list$tidyr <- do.call(cbind, tidyr.df.list)
## Make sure the results are the same.
t(sapply(long.list, names))
#> [,1] [,2] [,3] [,4] [,5]
#> namedCapture "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> tidyr "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> [,6] [,7] [,8] [,9]
#> namedCapture "JobID.taskN" "JobID.type" "position.chrom" "position.chromStart"
#> tidyr "JobID.taskN" "JobID.type" "position.chrom" "position.chromStart"
#> [,10]
#> namedCapture "position.chromEnd"
#> tidyr "position.chromEnd"
t(sapply(long.list, sapply, class))
#> position JobID JobID.job JobID.task JobID.task1
#> namedCapture "character" "character" "integer" "integer" "integer"
#> tidyr "character" "character" "integer" "integer" "integer"
#> JobID.taskN JobID.type position.chrom position.chromStart
#> namedCapture "integer" "character" "character" "integer"
#> tidyr "integer" "character" "character" "integer"
#> position.chromEnd
#> namedCapture "integer"
#> tidyr "integer"
long.list$tidyr$JobID.type <- ifelse(
is.na(long.list$tidyr$JobID.type),
"",
long.list$tidyr$JobID.type)
with(long.list, identical(tidyr, namedCapture))
#> [1] TRUE
Exercise for the reader: use rematch2::bind_re_match instead of
tidyr::extract (you should only have to change a few lines of code
in the for loop).
This second comparison uses a syntax with the entire regex on one line. In my opinion this syntax makes the regular expressions more difficult to read/understand. Complicated regular expressions like the one used for matching the JobID column are not maintainable/understandable at all using this syntax.
## First define data.
(sacct.df <- data.frame(
position=c(
"chr10:213,054,000-213,055,000",
"chrM:111,000-222,000",
"this will not match",
NA, # neither will this.
"chr1:110-111 chr2:220-222"), # two possible matches.
JobID=c(
"13937810_25",
"13937810_25.batch",
"13937810_25.extern",
"14022192_[1-3]",
"14022204_[4]"),
stringsAsFactors=FALSE))
#> position JobID
#> 1 chr10:213,054,000-213,055,000 13937810_25
#> 2 chrM:111,000-222,000 13937810_25.batch
#> 3 this will not match 13937810_25.extern
#> 4 <NA> 14022192_[1-3]
#> 5 chr1:110-111 chr2:220-222 14022204_[4]
short.list <- list()
## tidyr alternate (13 lines total)
e <- function(col.name, group.names, pattern){
  extracted <- tidyr::extract(#match one column of the global sacct.df.
    sacct.df, col.name, group.names, pattern, convert=TRUE)
  setNames(#return only the new columns, prefixed with the column name.
    extracted[, group.names, drop=FALSE],
    paste0(col.name, ".", group.names))
}
short.list$tidyr <- do.call(cbind, list(
sacct.df,
e("JobID", c("job", "task", "task1", "taskN", "type"),
"([0-9]+)_(?:([0-9]+)|\\[([0-9]+)(?:-([0-9]+))?\\])(?:[.](.*))?"),
e("position", c("chrom", "chromStart", "chromEnd"),
"(chr.*?):(.*?)-([0-9,]*)")))
## namedCapture alternate (7 lines total)
(short.list$namedCapture <- namedCapture::df_match_variable(
sacct.df,
JobID="(?P<job>[0-9]+)_(?:(?P<task>[0-9]+)|\\[(?P<task1>[0-9]+)(?:-(?P<taskN>[0-9]+))?\\])(?:[.](?P<type>.*))?",
position="(?P<chrom>chr.*?):(?P<chromStart>.*?)-(?P<chromEnd>[0-9,]*)"))
#> position JobID JobID.job JobID.task
#> 1 chr10:213,054,000-213,055,000 13937810_25 13937810 25
#> 2 chrM:111,000-222,000 13937810_25.batch 13937810 25
#> 3 this will not match 13937810_25.extern 13937810 25
#> 4 <NA> 14022192_[1-3] 14022192
#> 5 chr1:110-111 chr2:220-222 14022204_[4] 14022204
#> JobID.task1 JobID.taskN JobID.type position.chrom position.chromStart
#> 1 chr10 213,054,000
#> 2 batch chrM 111,000
#> 3 extern <NA> <NA>
#> 4 1 3 <NA> <NA>
#> 5 4 chr1 110
#> position.chromEnd
#> 1 213,055,000
#> 2 222,000
#> 3 <NA>
#> 4 <NA>
#> 5 111
short.list$namedCapture[] <- lapply(#convert every column, keeping strings as character.
  short.list$namedCapture,
  type.convert, as.is=TRUE)
## Make sure the results are the same.
t(sapply(short.list, names))
#> [,1] [,2] [,3] [,4] [,5]
#> tidyr "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> namedCapture "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> [,6] [,7] [,8] [,9]
#> tidyr "JobID.taskN" "JobID.type" "position.chrom" "position.chromStart"
#> namedCapture "JobID.taskN" "JobID.type" "position.chrom" "position.chromStart"
#> [,10]
#> tidyr "position.chromEnd"
#> namedCapture "position.chromEnd"
t(sapply(short.list, sapply, class))
#> position JobID JobID.job JobID.task JobID.task1
#> tidyr "character" "character" "integer" "integer" "integer"
#> namedCapture "character" "character" "integer" "integer" "integer"
#> JobID.taskN JobID.type position.chrom position.chromStart
#> tidyr "integer" "character" "character" "character"
#> namedCapture "integer" "character" "character" "character"
#> position.chromEnd
#> tidyr "character"
#> namedCapture "character"
short.list$tidyr$JobID.type <- ifelse(
is.na(short.list$tidyr$JobID.type),
"",
short.list$tidyr$JobID.type)
with(short.list, identical(tidyr, namedCapture))
#> [1] TRUE
rematch2::bind_re_match is similar to tidyr::extract, but
additionally supports named capture regular expressions. Overall, the
comparison shows that both packages can use a relatively verbose and
readable syntax to define complex regular expressions piece by piece:
range.list <- list(
"\\[",
task1="[0-9]+", as.integer,
list(
"-",#begin optional end of range.
taskN="[0-9]+", as.integer
), "?", #end is optional.
"\\]")
namedCapture::df_match_variable(sacct.df, JobID=range.list)
#> position JobID JobID.task1 JobID.taskN
#> 1 chr10:213,054,000-213,055,000 13937810_25 NA NA
#> 2 chrM:111,000-222,000 13937810_25.batch NA NA
#> 3 this will not match 13937810_25.extern NA NA
#> 4 <NA> 14022192_[1-3] 1 3
#> 5 chr1:110-111 chr2:220-222 14022204_[4] 4 NA
range.pat <- paste0(
"\\[",
"(?<task1>[0-9]+)",
"(?:",
"-",#begin optional end of range.
"(?<taskN>[0-9]+)",
")?", #end is optional.
"\\]")
rematch2::bind_re_match(sacct.df, JobID, range.pat)
#> position JobID task1 taskN
#> 1 chr10:213,054,000-213,055,000 13937810_25 <NA> <NA>
#> 2 chrM:111,000-222,000 13937810_25.batch <NA> <NA>
#> 3 this will not match 13937810_25.extern <NA> <NA>
#> 4 <NA> 14022192_[1-3] 1 3
#> 5 chr1:110-111 chr2:220-222 14022204_[4] 4
task.list <- list(
"_",
list(
task="[0-9]+", as.integer,
"|",#either one task(above) or range(below)
range.list))
namedCapture::df_match_variable(sacct.df, JobID=task.list)
#> position JobID JobID.task JobID.task1
#> 1 chr10:213,054,000-213,055,000 13937810_25 25 NA
#> 2 chrM:111,000-222,000 13937810_25.batch 25 NA
#> 3 this will not match 13937810_25.extern 25 NA
#> 4 <NA> 14022192_[1-3] NA 1
#> 5 chr1:110-111 chr2:220-222 14022204_[4] NA 4
#> JobID.taskN
#> 1 NA
#> 2 NA
#> 3 NA
#> 4 3
#> 5 NA
task.pat <- paste0(
"_",
"(?:",
"(?<task>[0-9]+)",
"|", #either one task(above) or range(below)
range.pat,
")")
rematch2::bind_re_match(sacct.df, JobID, task.pat)
#> position JobID task task1 taskN
#> 1 chr10:213,054,000-213,055,000 13937810_25 25
#> 2 chrM:111,000-222,000 13937810_25.batch 25
#> 3 this will not match 13937810_25.extern 25
#> 4 <NA> 14022192_[1-3] 1 3
#> 5 chr1:110-111 chr2:220-222 14022204_[4] 4
job.list <- list(
job="[0-9]+", as.integer,
task.list,
list(
"[.]",
type=".*"
), "?")
(job.namedCapture <- namedCapture::df_match_variable(sacct.df, JobID=job.list))
#> position JobID JobID.job JobID.task
#> 1 chr10:213,054,000-213,055,000 13937810_25 13937810 25
#> 2 chrM:111,000-222,000 13937810_25.batch 13937810 25
#> 3 this will not match 13937810_25.extern 13937810 25
#> 4 <NA> 14022192_[1-3] 14022192 NA
#> 5 chr1:110-111 chr2:220-222 14022204_[4] 14022204 NA
#> JobID.task1 JobID.taskN JobID.type
#> 1 NA NA
#> 2 NA NA batch
#> 3 NA NA extern
#> 4 1 3
#> 5 4 NA
job.pat <- paste0(
"(?<job>[0-9]+)",
task.pat,
"(?:",
"[.]",
"(?<type>.*)",
")?")
(job.rematch2 <- rematch2::bind_re_match(sacct.df, JobID, job.pat))
#> position JobID job task task1 taskN
#> 1 chr10:213,054,000-213,055,000 13937810_25 13937810 25
#> 2 chrM:111,000-222,000 13937810_25.batch 13937810 25
#> 3 this will not match 13937810_25.extern 13937810 25
#> 4 <NA> 14022192_[1-3] 14022192 1 3
#> 5 chr1:110-111 chr2:220-222 14022204_[4] 14022204 4
#> type
#> 1
#> 2 batch
#> 3 extern
#> 4
#> 5
pos.namedCapture <- namedCapture::df_match_variable(
job.namedCapture, position=list(
chrom="chr.*?",
":",
chromStart=".*?", to.int,
"-",
chromEnd="[0-9,]*", to.int))
str(pos.namedCapture)
#> 'data.frame': 5 obs. of 10 variables:
#> $ position : chr "chr10:213,054,000-213,055,000" "chrM:111,000-222,000" "this will not match" NA ...
#> $ JobID : chr "13937810_25" "13937810_25.batch" "13937810_25.extern" "14022192_[1-3]" ...
#> $ JobID.job : int 13937810 13937810 13937810 14022192 14022204
#> $ JobID.task : int 25 25 25 NA NA
#> $ JobID.task1 : int NA NA NA 1 4
#> $ JobID.taskN : int NA NA NA 3 NA
#> $ JobID.type : chr "" "batch" "extern" "" ...
#> $ position.chrom : chr "chr10" "chrM" NA NA ...
#> $ position.chromStart: int 213054000 111000 NA NA 110
#> $ position.chromEnd : int 213055000 222000 NA NA 111
pos.rematch2 <- rematch2::bind_re_match(
job.rematch2, position, paste0(
"(?<chrom>chr.*?)",
":",
"(?<chromStart>.*?)",
"-",
"(?<chromEnd>[0-9,]*)"))
str(pos.rematch2)
#> 'data.frame': 5 obs. of 10 variables:
#> $ position : chr "chr10:213,054,000-213,055,000" "chrM:111,000-222,000" "this will not match" NA ...
#> $ JobID : chr "13937810_25" "13937810_25.batch" "13937810_25.extern" "14022192_[1-3]" ...
#> $ job : chr "13937810" "13937810" "13937810" "14022192" ...
#> $ task : chr "25" "25" "25" "" ...
#> $ task1 : chr "" "" "" "1" ...
#> $ taskN : chr "" "" "" "3" ...
#> $ type : chr "" "batch" "extern" "" ...
#> $ chrom : chr "chr10" "chrM" NA NA ...
#> $ chromStart: chr "213,054,000" "111,000" NA NA ...
#> $ chromEnd : chr "213,055,000" "222,000" NA NA ...
The main difference in syntax is that group names are specified in
the regular expression string literal for rematch2, whereas group
names are specified as R argument names for namedCapture.
A difference in the result is that all columns of pos.rematch2 are
character, whereas some columns of pos.namedCapture have already
been converted to integer. Using rematch2, type conversion may be
accomplished as a post-processing step:
## Add converted copies of the rematch2 columns under the column names used
## in pos.namedCapture, so that the two results can be compared below.
converted.rematch2 <- transform(
pos.rematch2,
JobID.job=to.int(job),
JobID.task1=to.int(task1),
JobID.taskN=to.int(taskN),
JobID.task=to.int(task),
JobID.type=type,
position.chrom=chrom,
position.chromStart=to.int(chromStart),
position.chromEnd=to.int(chromEnd),
stringsAsFactors=FALSE)
some.rematch2 <- converted.rematch2[, names(pos.namedCapture)]
identical(some.rematch2, pos.namedCapture)
#> [1] TRUE
Exercise for the reader: convert all the rematch2::bind_re_match
calls in this section to tidyr::extract
calls.