Regular expression examples

max.N <- 20
atime.list <- atime::atime(
  N=1:max.N,
  setup={
    subject <- paste(rep("a", N), collapse="")
    pattern <- paste(rep(c("(a)?", "\\1"), each=N), collapse="")
  },
  ICU=stringi::stri_match(subject, regex=pattern),
  PCRE=regexpr(pattern, subject, perl=TRUE),
  TRE=regexpr(pattern, subject, perl=FALSE)
)
plot(atime.list)

plot of chunk unnamed-chunk-1


if(require(ggplot2)){
  gg <- ggplot()+
    geom_ribbon(aes(
      N, ymin=min, ymax=max, fill=expr.name),
      data=atime.list$meas,
      alpha=0.5)+
    geom_line(aes(
      N, median, color=expr.name),
      data=atime.list$meas)+
    scale_x_log10()+
    scale_y_log10("seconds (median line, min/max band)")
  if(require("directlabels")){
    direct.label(gg, "top.polygons")
  }else{
    gg
  }
}

plot of chunk unnamed-chunk-1

The plots above show that ICU/PCRE/TRE are all exponential in N (subject/pattern size) when the pattern contains backreferences.

atime.list <- atime::atime(
  ICU=stringi::stri_match(subject, regex=pattern),
  RE2=re2::re2_match(subject, pattern),
  PCRE=regexpr(pattern, subject, perl=TRUE),
  TRE=regexpr(pattern, subject, perl=FALSE),
  setup={
    subject <- paste(rep("a", N), collapse="")
    pattern <- paste(rep(c("a?", "a"), each=N), collapse="")
  },
  N=1:25,
  times=3)
plot(atime.list)

plot of chunk unnamed-chunk-2


best.list <- atime::references_best(atime.list)
if(require(ggplot2)){
  gg <- ggplot()+
    geom_ribbon(aes(
      N, ymin=min, ymax=max, fill=expr.name),
      data=best.list$meas[unit=="seconds"],
      alpha=0.5)+
    geom_line(aes(
      N, empirical, color=expr.name),
      data=best.list$meas)+
    facet_grid(unit ~ ., scales="free")+
    theme(legend.position="none")+
    scale_x_log10()+
    scale_y_log10("median line, min/max band")
  if(require("directlabels")){
    gg+
      geom_dl(aes(
        N, empirical, color=expr.name, label=expr.class),
        method="right.polygons",
        data=best.list$meas)+
      coord_cartesian(xlim=c(1,30))
  }else{
    gg
  }
}
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Transformation introduced infinite values in continuous y-axis

plot of chunk unnamed-chunk-2

The plots above show that ICU/PCRE are exponential time whereas RE2/TRE are polynomial time. Exercise for the reader: modify the above code to use the seconds.limit argument so that you can see what happens to ICU/PCRE for larger N (hint: you should see a difference at larger sizes).