The purpose of this vignette is to make figures which show the efficiency of data.table.

fwrite: fast CSV writer

library(data.table)
library(readr)
library(arrow)
library(ggplot2)
write.colors <- c(
  "readr::write_csv"="#9970AB",
  "data.table::fwrite"="#D6604D",
  "write_csv_arrow"="#BF812D", 
  "utils::write.csv"="deepskyblue")
n.rows <- 100
seconds.limit <- 1

atime.write.vary.cols <- atime::atime(
  N=as.integer(10^seq(2, 6, by=0.5)),
  setup={
    set.seed(1)
    input.vec <- rnorm(n.rows*N)
    input.mat <- matrix(input.vec, n.rows, N)
    input.df <- data.frame(input.mat)
  },
  seconds.limit = seconds.limit,
  "data.table::fwrite"={
    data.table::fwrite(input.df, tempfile(), showProgress = FALSE)
  },
  "write_csv_arrow"={
    arrow::write_csv_arrow(input.df, tempfile())
  },
  "readr::write_csv"={
    readr::write_csv(input.df, tempfile(), progress = FALSE)
  },
  "utils::write.csv"=utils::write.csv(input.df, tempfile()))
## Warning: Some expressions had a GC in every iteration; so filtering is disabled.
refs.write.vary.cols <- atime::references_best(atime.write.vary.cols)
pred.write.vary.cols <- predict(refs.write.vary.cols)

gg.write <- plot(pred.write.vary.cols)+
  theme(text=element_text(size=20))+
  ggtitle(sprintf("Write real numbers to CSV, %d x N", n.rows))+
  scale_x_log10("N = number of columns to write")+
  scale_y_log10("Computation time (seconds)
median line, min/max band
over 10 timings")+
  facet_null()+
  scale_fill_manual(values=write.colors)+
  scale_color_manual(values=write.colors)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
gg.write
## Warning: Transformation introduced infinite values in continuous x-axis

plot of chunk write

fread: fast CSV reader

read.colors <- c(
  "readr::read_csv\n(lazy=TRUE)"="#9970AB",
  "readr::read_csv\n(lazy=FALSE)"="#9970AB",
  "data.table::fread"="#D6604D",
  "read_csv_arrow"="#BF812D", 
  "utils::read.csv"="deepskyblue")
atime.read.vary.cols <- atime::atime(
  N=as.integer(10^seq(2, 6, by=0.5)),
  setup={
    set.seed(1)
    input.vec <- rnorm(n.rows*N)
    input.mat <- matrix(input.vec, n.rows, N)
    input.df <- data.frame(input.mat)
    input.csv <- tempfile()
    fwrite(input.df, input.csv)
  },
  seconds.limit = seconds.limit,
  "data.table::fread"={
    data.table::fread(input.csv, showProgress = FALSE)
  },
  "read_csv_arrow"={
    arrow::read_csv_arrow(input.csv)
  },
  "readr::read_csv\n(lazy=TRUE)"={
    readr::read_csv(input.csv, progress = FALSE, show_col_types = FALSE, lazy=TRUE)
  },
  "readr::read_csv\n(lazy=FALSE)"={
    readr::read_csv(input.csv, progress = FALSE, show_col_types = FALSE, lazy=FALSE)
  },
  "utils::read.csv"=utils::read.csv(input.csv))
## Warning: Some expressions had a GC in every iteration; so filtering is disabled.
refs.read.vary.cols <- atime::references_best(atime.read.vary.cols)
pred.read.vary.cols <- predict(refs.read.vary.cols)

gg.read <- plot(pred.read.vary.cols)+
  theme(text=element_text(size=20))+
  ggtitle(sprintf("Read real numbers from CSV, %d x N", n.rows))+
  scale_x_log10("N = number of columns to read")+
  scale_y_log10("Computation time (seconds)
median line, min/max band
over 10 timings")+
  facet_null()+
  scale_fill_manual(values=read.colors)+
  scale_color_manual(values=read.colors)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
gg.read
## Warning: Transformation introduced infinite values in continuous x-axis

plot of chunk read

Summarize by group

ml.colors <- c(
  "dplyr::summarise"="#9970AB",
  "[.data.table"="#D6604D",
  "stats::aggregate"="deepskyblue")
options(dplyr.summarise.inform=FALSE)
n.folds <- 10
ml.atime <- atime::atime(
  N=as.integer(10^seq(2, 7, by=0.5)),
  setup={
    loss.dt <- data.table(
      name="loss", 
      fold=rep(1:n.folds, each=2*N),
      loss=rnorm(2*N*n.folds),
      set=rep(c("subtrain","validation"),each=N),
      epoch=1:N,
      key=c("set","epoch","fold"))
  },
  seconds.limit=seconds.limit,
  "[.data.table"={
    loss.dt[, .(
      loss_length=.N,
      loss_mean=mean(loss),
      loss_sd=sd(loss)
    ), by=.(set, epoch)]
  },
  "stats::aggregate"={
    res <- stats::aggregate(
      loss ~ set + epoch, 
      loss.dt, 
      function(values)list(c(
        loss_length=length(values),
        loss_mean=mean(values), 
        loss_sd=sd(values))))
    data.frame(
      subset(res, select=-loss), 
      do.call(rbind, res$loss))
  },
  "dplyr::summarise"={
    loss.dt |> 
      dplyr::group_by(set, epoch) |> 
      dplyr::summarise(
        loss_length=length(loss),
        loss_mean=mean(loss), 
        loss_sd=sd(loss))
  })
## Warning: Some expressions had a GC in every iteration; so filtering is disabled.
ml.refs <- atime::references_best(ml.atime)
ml.pred <- predict(ml.refs)
ml.gg <- plot(ml.pred)+
  theme(text=element_text(size=20))+
  ggtitle(sprintf("Mean,SD,Length over %d real numbers, N times", n.folds))+
  scale_x_log10("N = number of Mean,SD,Length to compute")+
  scale_y_log10("Computation time (seconds)
median line, min/max band
over 10 timings")+
  facet_null()+
  scale_fill_manual(values=ml.colors)+
  scale_color_manual(values=ml.colors)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
ml.gg
## Warning: Transformation introduced infinite values in continuous x-axis

plot of chunk summarize

Summarize by group, expanded

The previous section is simpler to explain, whereas this section is more comprehensive/fair, because it shows versions of data.table with and without key. Both versions are much faster than the competitors, though.

options(dplyr.summarise.inform=FALSE)
n.folds <- 10
ml.exp.atime <- atime::atime(
  N=as.integer(10^seq(2, 7, by=0.5)),
  setup={
    loss.dt <- data.table(
      name="loss", 
      fold=rep(1:n.folds, each=2*N),
      loss=rnorm(2*N*n.folds),
      set=rep(c("subtrain","validation"),each=N),
      epoch=1:N)
    key.dt <- data.table(loss.dt, key=c("set","epoch","fold"))
  },
  seconds.limit=seconds.limit,
  "[.data.table(no key)"={
    loss.dt[, .(
      loss_length=.N,
      loss_mean=mean(loss),
      loss_sd=sd(loss)
    ), by=.(set, epoch)]
  },
  "[.data.table(key)"={
    key.dt[, .(
      loss_length=.N,
      loss_mean=mean(loss),
      loss_sd=sd(loss)
    ), by=.(set, epoch)]
  },
  "stats::aggregate"={
    res <- stats::aggregate(
      loss ~ set + epoch, 
      loss.dt, 
      function(values)list(c(
        loss_length=length(values),
        loss_mean=mean(values), 
        loss_sd=sd(values))))
    data.frame(
      subset(res, select=-loss), 
      do.call(rbind, res$loss))
  },
  "dplyr::summarise"={
    loss.dt |> 
      dplyr::group_by(set, epoch) |> 
      dplyr::summarise(
        loss_length=length(loss),
        loss_mean=mean(loss), 
        loss_sd=sd(loss))
  },
  "collapse::fsummarise"={
    loss.dt |> 
      collapse::fgroup_by(set, epoch) |> 
      collapse::fsummarise(
        loss_length=length(loss),
        loss_mean=mean(loss), 
        loss_sd=sd(loss))
  })
## Warning: Some expressions had a GC in every iteration; so filtering is disabled.

## Warning: Some expressions had a GC in every iteration; so filtering is disabled.

## Warning: Some expressions had a GC in every iteration; so filtering is disabled.
ml.exp.refs <- atime::references_best(ml.exp.atime)
ml.exp.pred <- predict(ml.exp.refs)
ml.exp.colors <- c(
  "collapse::fsummarise"="#5AAE61",
  "dplyr::summarise"="#9970AB",
  "[.data.table(key)"="#D6604D",
  "[.data.table(no key)"="#B6604D",
  "stats::aggregate"="deepskyblue")
ml.exp.gg <- plot(ml.exp.pred)+
  theme(text=element_text(size=20))+
  ggtitle(sprintf("Mean,SD,Length over %d real numbers, N times", n.folds))+
  scale_x_log10("N = number of Mean,SD,Length to compute")+
  scale_y_log10("Computation time (seconds)
median line, min/max band
over 10 timings")+
  facet_null()+
  scale_fill_manual(values=ml.exp.colors)+
  scale_color_manual(values=ml.exp.colors)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
ml.exp.gg
## Warning: Transformation introduced infinite values in continuous x-axis

plot of chunk summarize-exp

version info

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=fr_FR.UTF-8       LC_NUMERIC=C               LC_TIME=fr_FR.UTF-8       
##  [4] LC_COLLATE=fr_FR.UTF-8     LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=fr_FR.UTF-8   
##  [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/Phoenix
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  utils     datasets  grDevices methods   base     
## 
## other attached packages:
## [1] ggplot2_3.4.3     arrow_13.0.0      readr_2.1.4       data.table_1.14.8
## 
## loaded via a namespace (and not attached):
##  [1] bit_4.0.5              gtable_0.3.4           highr_0.10             dplyr_1.1.2           
##  [5] compiler_4.3.1         crayon_1.5.2           Rcpp_1.0.11            tidyselect_1.2.0      
##  [9] collapse_1.9.6         parallel_4.3.1         assertthat_0.2.1       scales_1.2.1          
## [13] directlabels_2023.8.25 lattice_0.21-8         R6_2.5.1               generics_0.1.3        
## [17] knitr_1.43             tibble_3.2.1           munsell_0.5.0          atime_2023.10.9       
## [21] pillar_1.9.0           tzdb_0.3.0             rlang_1.1.1            utf8_1.2.3            
## [25] xfun_0.40              quadprog_1.5-8         bit64_4.0.5            cli_3.6.1             
## [29] withr_2.5.0            magrittr_2.0.3         grid_4.3.1             vroom_1.6.3           
## [33] hms_1.1.3              lifecycle_1.0.3        vctrs_0.6.3            bench_1.1.3           
## [37] evaluate_0.21          glue_1.6.2             farver_2.1.1           profmem_0.6.0         
## [41] fansi_1.0.4            colorspace_2.1-0       purrr_1.0.2            tools_4.3.1           
## [45] pkgconfig_2.0.3