Skip to contents

The goal of duckplyr is to provide a drop-in replacement for dplyr that uses DuckDB as a backend for fast operation. DuckDB is an in-process SQL OLAP database management system.

duckplyr also defines a set of generics that provide a low-level implementer’s interface for dplyr’s high-level user interface.

Installation

Install duckplyr from CRAN with:

install.packages("duckplyr")

You can also install the development version of duckplyr from R-universe:

install.packages('duckplyr', repos = c('https://duckdblabs.r-universe.dev', 'https://cloud.r-project.org'))

Or from GitHub with:

# install.packages("pak", repos = sprintf("https://r-lib.github.io/p/pak/stable/%s/%s/%s", .Platform$pkgType, R.Version()$os, R.Version()$arch))
pak::pak("duckdblabs/duckplyr")

Examples

library(conflicted)
library(duckplyr)
conflict_prefer("filter", "duckplyr")
#> [conflicted] Will prefer duckplyr::filter over
#> any other package.

There are two ways to use duckplyr.

  1. To enable duckplyr for individual data frames, use as_duckplyr_df() as the first step in your pipe.
  2. To enable duckplyr for the entire session, use methods_overwrite().

The examples below illustrate both methods. See also the companion demo repository for a use case with a large dataset.

Usage for individual data frames

This example illustrates usage of duckplyr for individual data frames.

Use as_duckplyr_df() to enable processing with duckdb:

out <-
  palmerpenguins::penguins %>%
  # CAVEAT: factor columns are not supported yet
  mutate(across(where(is.factor), as.character)) %>%
  as_duckplyr_df() %>%
  mutate(bill_area = bill_length_mm * bill_depth_mm) %>%
  summarize(.by = c(species, sex), mean_bill_area = mean(bill_area)) %>%
  filter(species != "Gentoo")

The result is a data frame or tibble, with its own class.

class(out)
#> [1] "duckplyr_df" "tbl_df"      "tbl"         "data.frame"
names(out)
#> [1] "species"        "sex"            "mean_bill_area"

duckdb is responsible for eventually carrying out the operations. Despite the late filter, the summary is not computed for the Gentoo species.

out %>%
  explain()
#> ┌───────────────────────────┐
#> │          ORDER_BY         │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │          ORDERS:          │
#> │      dataframe_42_42      │
#> │      42.___row_number ASC     │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │           FILTER          │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │   (species != 'Gentoo')   │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │          EC: 344          │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │      STREAMING_WINDOW     │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │    ROW_NUMBER() OVER ()   │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │          ORDER_BY         │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │          ORDERS:          │
#> │      dataframe_42_42      │
#> │      42.___row_number ASC     │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │       HASH_GROUP_BY       │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │             #0            │
#> │             #1            │
#> │          min(#2)          │
#> │          mean(#3)         │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │         PROJECTION        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │          species          │
#> │            sex            │
#> │       ___row_number       │
#> │         bill_area         │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │      STREAMING_WINDOW     │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │    ROW_NUMBER() OVER ()   │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │         PROJECTION        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │          species          │
#> │            sex            │
#> │         bill_area         │
#> └─────────────┬─────────────┘                             
#> ┌─────────────┴─────────────┐
#> │     R_DATAFRAME_SCAN      │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │         data.frame        │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │          species          │
#> │       bill_length_mm      │
#> │       bill_depth_mm       │
#> │            sex            │
#> │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
#> │          EC: 344          │
#> └───────────────────────────┘

All data frame operations are supported. Computation happens upon the first request.

out$mean_bill_area
#> materializing:
#> ---------------------
#> --- Relation Tree ---
#> ---------------------
#> Projection [species as species, sex as sex, mean_bill_area as mean_bill_area]
#>   Order [___row_number ASC]
#>     Filter [!=(species, 'Gentoo')]
#>       Projection [species as species, sex as sex, mean_bill_area as mean_bill_area, row_number() OVER () as ___row_number]
#>         Projection [species as species, sex as sex, mean_bill_area as mean_bill_area]
#>           Order [___row_number ASC]
#>             Aggregate [species, sex, min(___row_number), mean(bill_area)]
#>               Projection [species as species, island as island, bill_length_mm as bill_length_mm, bill_depth_mm as bill_depth_mm, flipper_length_mm as flipper_length_mm, body_mass_g as body_mass_g, sex as sex, "year" as year, bill_area as bill_area, row_number() OVER () as ___row_number]
#>                 Projection [species as species, island as island, bill_length_mm as bill_length_mm, bill_depth_mm as bill_depth_mm, flipper_length_mm as flipper_length_mm, body_mass_g as body_mass_g, sex as sex, "year" as year, *(bill_length_mm, bill_depth_mm) as bill_area]
#>                   r_dataframe_scan(0xdeadbeef)
#> 
#> ---------------------
#> -- Result Columns  --
#> ---------------------
#> - species (VARCHAR)
#> - sex (VARCHAR)
#> - mean_bill_area (DOUBLE)
#> 
#> [1] 770.2627 656.8523 694.9360 819.7503 984.2279

After the computation has been carried out, the results are available immediately:

out
#> # A tibble: 5 × 3
#>   species   sex    mean_bill_area
#>   <chr>     <chr>           <dbl>
#> 1 Adelie    male             770.
#> 2 Adelie    female           657.
#> 3 Adelie    NA               695.
#> 4 Chinstrap female           820.
#> 5 Chinstrap male             984.

Session-wide usage

This example illustrates usage of duckplyr for all data frames in the R session.

Use methods_overwrite() to enable processing with duckdb for all data frames:

methods_overwrite()

This is the same query as above, without as_duckplyr_df():

out <-
  palmerpenguins::penguins %>%
  # CAVEAT: factor columns are not supported yet
  mutate(across(where(is.factor), as.character)) %>%
  mutate(bill_area = bill_length_mm * bill_depth_mm) %>%
  summarize(.by = c(species, sex), mean_bill_area = mean(bill_area)) %>%
  filter(species != "Gentoo")

The result is a plain tibble now:

class(out)
#> [1] "tbl_df"     "tbl"        "data.frame"

Querying the number of rows also starts the computation:

nrow(out)
#> materializing:
#> ---------------------
#> --- Relation Tree ---
#> ---------------------
#> Projection [species as species, sex as sex, mean_bill_area as mean_bill_area]
#>   Order [___row_number ASC]
#>     Filter [!=(species, 'Gentoo')]
#>       Projection [species as species, sex as sex, mean_bill_area as mean_bill_area, row_number() OVER () as ___row_number]
#>         Projection [species as species, sex as sex, mean_bill_area as mean_bill_area]
#>           Order [___row_number ASC]
#>             Aggregate [species, sex, min(___row_number), mean(bill_area)]
#>               Projection [species as species, island as island, bill_length_mm as bill_length_mm, bill_depth_mm as bill_depth_mm, flipper_length_mm as flipper_length_mm, body_mass_g as body_mass_g, sex as sex, "year" as year, bill_area as bill_area, row_number() OVER () as ___row_number]
#>                 Projection [species as species, island as island, bill_length_mm as bill_length_mm, bill_depth_mm as bill_depth_mm, flipper_length_mm as flipper_length_mm, body_mass_g as body_mass_g, sex as sex, "year" as year, *(bill_length_mm, bill_depth_mm) as bill_area]
#>                   r_dataframe_scan(0xdeadbeef)
#> 
#> ---------------------
#> -- Result Columns  --
#> ---------------------
#> - species (VARCHAR)
#> - sex (VARCHAR)
#> - mean_bill_area (DOUBLE)
#> [1] 5

Restart R, or call methods_restore() to revert to the default dplyr implementation.

methods_restore()

dplyr is active again:

palmerpenguins::penguins %>%
  # CAVEAT: factor columns are not supported yet
  mutate(across(where(is.factor), as.character)) %>%
  mutate(bill_area = bill_length_mm * bill_depth_mm) %>%
  summarize(.by = c(species, sex), mean_bill_area = mean(bill_area)) %>%
  filter(species != "Gentoo")
#> materializing:
#> ---------------------
#> --- Relation Tree ---
#> ---------------------
#> Projection [species as species, sex as sex, mean_bill_area as mean_bill_area]
#>   Order [___row_number ASC]
#>     Filter [!=(species, 'Gentoo')]
#>       Projection [species as species, sex as sex, mean_bill_area as mean_bill_area, row_number() OVER () as ___row_number]
#>         Projection [species as species, sex as sex, mean_bill_area as mean_bill_area]
#>           Order [___row_number ASC]
#>             Aggregate [species, sex, min(___row_number), mean(bill_area)]
#>               Projection [species as species, island as island, bill_length_mm as bill_length_mm, bill_depth_mm as bill_depth_mm, flipper_length_mm as flipper_length_mm, body_mass_g as body_mass_g, sex as sex, "year" as year, bill_area as bill_area, row_number() OVER () as ___row_number]
#>                 Projection [species as species, island as island, bill_length_mm as bill_length_mm, bill_depth_mm as bill_depth_mm, flipper_length_mm as flipper_length_mm, body_mass_g as body_mass_g, sex as sex, "year" as year, *(bill_length_mm, bill_depth_mm) as bill_area]
#>                   r_dataframe_scan(0xdeadbeef)
#> 
#> ---------------------
#> -- Result Columns  --
#> ---------------------
#> - species (VARCHAR)
#> - sex (VARCHAR)
#> - mean_bill_area (DOUBLE)
#> 
#> # A tibble: 5 × 3
#>   species   sex    mean_bill_area
#>   <chr>     <chr>           <dbl>
#> 1 Adelie    male             770.
#> 2 Adelie    female           657.
#> 3 Adelie    NA               695.
#> 4 Chinstrap female           820.
#> 5 Chinstrap male             984.

Extensibility

This package also provides generics, for which other packages may then implement methods.

library(duckplyr)

# Create a relational to be used by examples below
new_dfrel <- function(x) {
  stopifnot(is.data.frame(x))
  new_relational(list(x), class = "dfrel")
}
mtcars_rel <- new_dfrel(mtcars[1:5, 1:4])

# Example 1: return a data.frame
rel_to_df.dfrel <- function(rel, ...) {
  unclass(rel)[[1]]
}
rel_to_df(mtcars_rel)
#>                    mpg cyl disp  hp
#> Mazda RX4         21.0   6  160 110
#> Mazda RX4 Wag     21.0   6  160 110
#> Datsun 710        22.8   4  108  93
#> Hornet 4 Drive    21.4   6  258 110
#> Hornet Sportabout 18.7   8  360 175

# Example 2: A (random) filter
rel_filter.dfrel <- function(rel, exprs, ...) {
  df <- unclass(rel)[[1]]

  # A real implementation would evaluate the predicates defined
  # by the exprs argument
  new_dfrel(df[sample.int(nrow(df), 3, replace = TRUE), ])
}

rel_filter(
  mtcars_rel,
  list(
    relexpr_function(
      "gt",
      list(relexpr_reference("cyl"), relexpr_constant("6"))
    )
  )
)
#> [[1]]
#>                  mpg cyl disp  hp
#> Mazda RX4 Wag   21.0   6  160 110
#> Mazda RX4 Wag.1 21.0   6  160 110
#> Datsun 710      22.8   4  108  93
#> 
#> attr(,"class")
#> [1] "dfrel"      "relational"

# Example 3: A custom projection
rel_project.dfrel <- function(rel, exprs, ...) {
  df <- unclass(rel)[[1]]

  # A real implementation would evaluate the expressions defined
  # by the exprs argument
  new_dfrel(df[seq_len(min(3, ncol(df)))])
}

rel_project(
  mtcars_rel,
  list(relexpr_reference("cyl"), relexpr_reference("disp"))
)
#> [[1]]
#>                    mpg cyl disp
#> Mazda RX4         21.0   6  160
#> Mazda RX4 Wag     21.0   6  160
#> Datsun 710        22.8   4  108
#> Hornet 4 Drive    21.4   6  258
#> Hornet Sportabout 18.7   8  360
#> 
#> attr(,"class")
#> [1] "dfrel"      "relational"

# Example 4: A custom ordering (eg, ascending by mpg)
rel_order.dfrel <- function(rel, exprs, ...) {
  df <- unclass(rel)[[1]]

  # A real implementation would evaluate the expressions defined
  # by the exprs argument
  new_dfrel(df[order(df[[1]]), ])
}

rel_order(
  mtcars_rel,
  list(relexpr_reference("mpg"))
)
#> [[1]]
#>                    mpg cyl disp  hp
#> Hornet Sportabout 18.7   8  360 175
#> Mazda RX4         21.0   6  160 110
#> Mazda RX4 Wag     21.0   6  160 110
#> Hornet 4 Drive    21.4   6  258 110
#> Datsun 710        22.8   4  108  93
#> 
#> attr(,"class")
#> [1] "dfrel"      "relational"

# Example 5: A custom join
rel_join.dfrel <- function(left, right, conds, join, ...) {
  left_df <- unclass(left)[[1]]
  right_df <- unclass(right)[[1]]

  # A real implementation would evaluate the expressions
  # defined by the conds argument,
  # use different join types based on the join argument,
  # and implement the join itself instead of relaying to left_join().
  new_dfrel(dplyr::left_join(left_df, right_df))
}

rel_join(new_dfrel(data.frame(mpg = 21)), mtcars_rel)
#> Joining with `by = join_by(mpg)`
#> Joining with `by = join_by(mpg)`
#> [[1]]
#>   mpg cyl disp  hp
#> 1  21   6  160 110
#> 2  21   6  160 110
#> 
#> attr(,"class")
#> [1] "dfrel"      "relational"

# Example 6: Limit the maximum rows returned
rel_limit.dfrel <- function(rel, n, ...) {
  df <- unclass(rel)[[1]]

  new_dfrel(df[seq_len(n), ])
}

rel_limit(mtcars_rel, 3)
#> [[1]]
#>                mpg cyl disp  hp
#> Mazda RX4     21.0   6  160 110
#> Mazda RX4 Wag 21.0   6  160 110
#> Datsun 710    22.8   4  108  93
#> 
#> attr(,"class")
#> [1] "dfrel"      "relational"

# Example 7: Suppress duplicate rows
#  (ignoring row names)
rel_distinct.dfrel <- function(rel, ...) {
  df <- unclass(rel)[[1]]

  new_dfrel(df[!duplicated(df), ])
}

rel_distinct(new_dfrel(mtcars[1:3, 1:4]))
#> [[1]]
#>             mpg cyl disp  hp
#> Mazda RX4  21.0   6  160 110
#> Datsun 710 22.8   4  108  93
#> 
#> attr(,"class")
#> [1] "dfrel"      "relational"

# Example 8: Return column names
rel_names.dfrel <- function(rel, ...) {
  df <- unclass(rel)[[1]]

  names(df)
}

rel_names(mtcars_rel)
#> [1] "mpg"  "cyl"  "disp" "hp"