diff --git a/.Rbuildignore b/.Rbuildignore
index 4e090adda..b2d1c0c17 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -32,3 +32,4 @@
 ^\.editorconfig$
 ^rustfmt\.toml$
 ^\.lintr\.R$
+^\.mega-linter\.yml$
diff --git a/.editorconfig b/.editorconfig
index 98f40b5ae..22adfb308 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -3,6 +3,8 @@ root = true
 [*]
 insert_final_newline = true
 end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
 
 [*.md]
 indent_style = space
diff --git a/.github/workflows/mega-linter.yaml b/.github/workflows/mega-linter.yaml
new file mode 100644
index 000000000..2778e014f
--- /dev/null
+++ b/.github/workflows/mega-linter.yaml
@@ -0,0 +1,79 @@
+---
+# MegaLinter GitHub Action configuration file
+# More info at https://megalinter.io
+name: MegaLinter
+
+on:
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.ref }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+jobs:
+  megalinter:
+    name: MegaLinter
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          fetch-depth: 0
+
+      - name: MegaLinter
+        id: ml
+        # You can override the MegaLinter flavor used to get faster performance
+        # More info at https://megalinter.io/flavors/
+        uses: oxsecurity/megalinter/flavors/cupcake@v7.9.0
+        env:
+          # All available variables are described in documentation
+          # https://megalinter.io/configuration/
+          VALIDATE_ALL_CODEBASE: true
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # ADD YOUR CUSTOM ENV VARIABLES HERE OR DEFINE THEM IN A FILE .mega-linter.yml AT THE ROOT OF YOUR REPOSITORY
+          # DISABLE: COPYPASTE,SPELL # Uncomment to disable copy-paste and spell checks
+
+      # Upload MegaLinter artifacts
+      - name: Archive production artifacts
+        if: success() || failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: MegaLinter reports
+          path: |
+            megalinter-reports
+            mega-linter.log
+
+      # Create pull request if applicable (for now works only on PR from same repository, not from forks)
+      - name: Create Pull Request with applied fixes
+        id: cpr
+        if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'pull_request' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
+          commit-message: "[MegaLinter] Apply linters automatic fixes"
+          title: "[MegaLinter] Apply linters automatic fixes"
+          labels: bot
+      - name: Create PR output
+        if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'pull_request' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
+        run: |
+          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
+          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
+
+      # Push new commit if applicable (for now works only on PR from same repository, not from forks)
+      - name: Prepare commit
+        if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'commit' && github.ref != 'refs/heads/main' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
+        run: sudo chown -Rc $UID .git/
+      - name: Commit and push applied linter fixes
+        if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'commit' && github.ref != 'refs/heads/main' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
+        uses: stefanzweifel/git-auto-commit-action@v4
+        with:
+          branch: ${{ github.event.pull_request.head.ref || github.head_ref || github.ref }}
+          commit_message: "[MegaLinter] Apply linters fixes"
+          commit_user_name: megalinter-bot
+          commit_user_email: nicolas.vuillamy@ox.security
diff --git a/.mega-linter.yml b/.mega-linter.yml
new file mode 100644
index 000000000..92000fcb2
--- /dev/null
+++ b/.mega-linter.yml
@@ -0,0 +1,6 @@
+APPLY_FIXES: all
+DISABLE:
+  - RUST
+  - R
+DISABLE_LINTERS:
+  - SPELL_CSPELL
diff --git a/NEWS.md b/NEWS.md
index 77aa41f66..8664c5138 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -10,6 +10,7 @@
 - It is now possible to create an empty `DataFrame` with a specific schema
   with `pl$DataFrame(schema = my_schema)` (#901).
 - New arguments `dtype` and `nan_to_null` for `pl$Series()` (#902).
+- New method `$partition_by()` (#898).
 
 ### Bug fixes
 
diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R
index 4e71934ae..5f576e2c0 100644
--- a/R/dataframe__frame.R
+++ b/R/dataframe__frame.R
@@ -884,6 +884,8 @@ DataFrame_filter = function(...) {
 #' @details Within each group, the order of the rows is always preserved,
 #' regardless of the `maintain_order` argument.
 #' @return [GroupBy][GroupBy_class] (a DataFrame with special groupby methods like `$agg()`)
+#' @seealso
+#' - [`$partition_by()`][DataFrame_partition_by]
 #' @examples
 #' df = pl$DataFrame(
 #'   a = c("a", "b", "a", "b", "c"),
@@ -2093,3 +2095,108 @@
     by, start_by, check_sorted
   )
 }
+
+
+#' Split a DataFrame into multiple DataFrames
+#'
+#' Similar to [`$group_by()`][DataFrame_group_by].
+#' Group by the given columns and return the groups as separate [DataFrames][DataFrame_class].
+#' This is useful in combination with functions like [lapply()] or `purrr::map()`.
+#' @param ... Character vectors of column names to group by. Passed to [`pl$col()`][pl_col].
+#' @param maintain_order If `TRUE`, ensure that the order of the groups is consistent with the input data.
+#' This is slower than a default partition-by operation.
+#' @param include_key If `TRUE`, include the columns used to partition the DataFrame in the output.
+#' @param as_nested_list This affects the format of the output.
+#' If `FALSE` (default), the output is a flat [list] of [DataFrames][DataFrame_class].
+#' If `TRUE` and at least one of the `maintain_order` or `include_key` arguments is `TRUE`,
+#' then each element of the output has two children: `key` and `data`.
+#' See the examples for more details.
+#' @return A list of [DataFrames][DataFrame_class]. See the examples for details.
+#' @seealso
+#' - [`$group_by()`][DataFrame_group_by]
+#' @examples
+#' df = pl$DataFrame(
+#'   a = c("a", "b", "a", "b", "c"),
+#'   b = c(1, 2, 1, 3, 3),
+#'   c = c(5, 4, 3, 2, 1)
+#' )
+#' df
+#'
+#' # Pass a single column name to partition by that column.
+#' df$partition_by("a")
+#'
+#' # Partition by multiple columns.
+#' df$partition_by("a", "b")
+#'
+#' # Partition by column data type
+#' df$partition_by(pl$String)
+#'
+#' # If `as_nested_list = TRUE`, the output is a list whose elements have a `key` and a `data` field.
+#' # The `key` is a named list of the key values, and the `data` is the DataFrame.
+#' df$partition_by("a", "b", as_nested_list = TRUE)
+#'
+#' # `as_nested_list = TRUE` should be used with `maintain_order = TRUE` or `include_key = TRUE`.
+#' tryCatch(
+#'   df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE),
+#'   warning = function(w) w
+#' )
+#'
+#' # Example of using this with lapply(), printing the key and a summary of the data
+#' df$partition_by("a", "b", maintain_order = FALSE, as_nested_list = TRUE) |>
+#'   lapply(\(x) {
+#'     sprintf("\nThe key value of `a` is %s and the key value of `b` is %s\n", x$key$a, x$key$b) |>
+#'       cat()
+#'     x$data$drop(names(x$key))$describe() |>
+#'       print()
+#'     invisible(NULL)
+#'   }) |>
+#'   invisible()
+DataFrame_partition_by = function(
+    ...,
+    maintain_order = TRUE,
+    include_key = TRUE,
+    as_nested_list = FALSE) {
+  uw = \(res) unwrap(res, "in $partition_by():")
+
+  by = result(dots_to_colnames(self, ...)) |>
+    uw()
+
+  if (!length(by)) {
+    Err_plain("There is no column to partition by.") |>
+      uw()
+  }
+
+  partitions = .pr$DataFrame$partition_by(self, by, maintain_order, include_key) |>
+    uw()
+
+  if (isTRUE(as_nested_list)) {
+    if (include_key) {
+      out = lapply(seq_along(partitions), \(index) {
+        data = partitions[[index]]
+        key = data$select(by)$head(1)$to_list()
+
+        list(key = key, data = data)
+      })
+
+      return(out)
+    } else if (maintain_order) {
+      key_df = self$select(by)$unique(maintain_order = TRUE)
+      out = lapply(seq_along(partitions), \(index) {
+        data = partitions[[index]]
+        key = key_df$slice(index - 1, 1)$to_list()
+
+        list(key = key, data = data)
+      })
+
+      return(out)
+    } else {
+      warning(
+        "cannot use `$partition_by` with ",
+        "`maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE`. ",
+        "Falling back to a flat list."
+      )
+    }
+  }
+
+  partitions
+}
diff --git a/R/dotdotdot.R b/R/dotdotdot.R
index 3fb0e41b2..cbf281fd8 100644
--- a/R/dotdotdot.R
+++ b/R/dotdotdot.R
@@ -45,9 +45,9 @@ unpack_list = function(..., .context = NULL, .call = sys.call(1L), skip_classes
   l = list2(..., .context = .context, .call = .call)
   if (
     length(l) == 1L &&
-    is.list(l[[1L]]) &&
-    !(!is.null(skip_classes) && inherits(l[[1L]], skip_classes)) &&
-    is.null(names(l))
+      is.list(l[[1L]]) &&
+      !(!is.null(skip_classes) && inherits(l[[1L]], skip_classes)) &&
+      is.null(names(l))
   ) {
     l[[1L]]
   } else {
@@ -79,3 +79,13 @@
     }
   })
 }
+
+
+#' Convert dots to a character vector of column names
+#' @param .df [RPolarsDataFrame]
+#' @param ... Arguments to pass to [`pl$col()`][pl_col]
+#' @noRd
+dots_to_colnames = function(.df, ..., .call = sys.call(1L)) {
+  result(pl$DataFrame(schema = .df$schema)$select(pl$col(...))$columns) |>
+    unwrap(call = .call)
+}
diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R
index 60e999f8d..d2e54ff8f 100644
--- a/R/extendr-wrappers.R
+++ b/R/extendr-wrappers.R
@@ -178,6 +178,8 @@ RPolarsDataFrame$to_struct <- function(name) .Call(wrap__RPolarsDataFrame__to_st
 
 RPolarsDataFrame$unnest <- function(names) .Call(wrap__RPolarsDataFrame__unnest, self, names)
 
+RPolarsDataFrame$partition_by <- function(by, maintain_order, include_key) .Call(wrap__RPolarsDataFrame__partition_by, self, by, maintain_order, include_key)
+
 RPolarsDataFrame$export_stream <- function(stream_ptr) invisible(.Call(wrap__RPolarsDataFrame__export_stream, self, stream_ptr))
 
 RPolarsDataFrame$from_arrow_record_batches <- function(rbr) .Call(wrap__RPolarsDataFrame__from_arrow_record_batches, rbr)
diff --git a/man/DataFrame_group_by.Rd b/man/DataFrame_group_by.Rd
index d4535326a..1a2fbddb1 100644
--- a/man/DataFrame_group_by.Rd
+++ b/man/DataFrame_group_by.Rd
@@ -53,3 +53,8 @@ df$group_by(d = "a", e = pl$col("b") \%/\% 2)$agg(
   pl$col("c")$mean()
 )
 }
+\seealso{
+\itemize{
+\item \code{\link[=DataFrame_partition_by]{$partition_by()}}
+}
+}
diff --git a/man/DataFrame_partition_by.Rd b/man/DataFrame_partition_by.Rd
new file mode 100644
index 000000000..d2fdcea3b
--- /dev/null
+++ b/man/DataFrame_partition_by.Rd
@@ -0,0 +1,78 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataframe__frame.R
+\name{DataFrame_partition_by}
+\alias{DataFrame_partition_by}
+\title{Split a DataFrame into multiple DataFrames}
+\usage{
+DataFrame_partition_by(
+  ...,
+  maintain_order = TRUE,
+  include_key = TRUE,
+  as_nested_list = FALSE
+)
+}
+\arguments{
+\item{...}{Character vectors of column names to group by. Passed to \code{\link[=pl_col]{pl$col()}}.}
+
+\item{maintain_order}{If \code{TRUE}, ensure that the order of the groups is consistent with the input data.
+This is slower than a default partition-by operation.}
+
+\item{include_key}{If \code{TRUE}, include the columns used to partition the DataFrame in the output.}
+
+\item{as_nested_list}{This affects the format of the output.
+If \code{FALSE} (default), the output is a flat \link{list} of \link[=DataFrame_class]{DataFrames}.
+If \code{TRUE} and at least one of the \code{maintain_order} or \code{include_key} arguments is \code{TRUE},
+then each element of the output has two children: \code{key} and \code{data}.
+See the examples for more details.}
+}
+\value{
+A list of \link[=DataFrame_class]{DataFrames}. See the examples for details.
+}
+\description{
+Similar to \code{\link[=DataFrame_group_by]{$group_by()}}.
+Group by the given columns and return the groups as separate \link[=DataFrame_class]{DataFrames}.
+This is useful in combination with functions like \code{\link[=lapply]{lapply()}} or \code{purrr::map()}.
+}
+\examples{
+df = pl$DataFrame(
+  a = c("a", "b", "a", "b", "c"),
+  b = c(1, 2, 1, 3, 3),
+  c = c(5, 4, 3, 2, 1)
+)
+df
+
+# Pass a single column name to partition by that column.
+df$partition_by("a")
+
+# Partition by multiple columns.
+df$partition_by("a", "b")
+
+# Partition by column data type
+df$partition_by(pl$String)
+
+# If `as_nested_list = TRUE`, the output is a list whose elements have a `key` and a `data` field.
+# The `key` is a named list of the key values, and the `data` is the DataFrame.
+df$partition_by("a", "b", as_nested_list = TRUE)
+
+# `as_nested_list = TRUE` should be used with `maintain_order = TRUE` or `include_key = TRUE`.
+tryCatch(
+  df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE),
+  warning = function(w) w
+)
+
+# Example of using this with lapply(), printing the key and a summary of the data
+df$partition_by("a", "b", maintain_order = FALSE, as_nested_list = TRUE) |>
+  lapply(\(x) {
+    sprintf("\nThe key value of `a` is \%s and the key value of `b` is \%s\n", x$key$a, x$key$b) |>
+      cat()
+    x$data$drop(names(x$key))$describe() |>
+      print()
+    invisible(NULL)
+  }) |>
+  invisible()
+}
+\seealso{
+\itemize{
+\item \code{\link[=DataFrame_group_by]{$group_by()}}
+}
+}
diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs
index 501626fa6..7ac01120b 100644
--- a/src/rust/src/rdataframe/mod.rs
+++ b/src/rust/src/rdataframe/mod.rs
@@ -328,6 +328,21 @@ impl RPolarsDataFrame {
         self.lazy().unnest(names)?.collect()
     }
 
+    pub fn partition_by(&self, by: Robj, maintain_order: Robj, include_key: Robj) -> RResult<List> {
+        let by = robj_to!(Vec, String, by)?;
+        let maintain_order = robj_to!(bool, maintain_order)?;
+        let include_key = robj_to!(bool, include_key)?;
+        let out = if maintain_order {
+            self.0.clone().partition_by_stable(by, include_key)
+        } else {
+            self.0.partition_by(by, include_key)
+        }
+        .map_err(polars_to_rpolars_err)?;
+
+        let vec = unsafe { std::mem::transmute::<Vec<pl::DataFrame>, Vec<RPolarsDataFrame>>(out) };
+        Ok(List::from_values(vec))
+    }
+
     pub fn export_stream(&self, stream_ptr: &str) {
         let schema = self.0.schema().to_arrow(false);
         let data_type = ArrowDataType::Struct(schema.fields);
diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md
index bab1e4683..9ac105d87 100644
--- a/tests/testthat/_snaps/after-wrappers.md
+++ b/tests/testthat/_snaps/after-wrappers.md
@@ -79,16 +79,16 @@
       [21] "group_by_dynamic" "head"             "height"           "join"
       [25] "join_asof"        "last"             "lazy"             "limit"
       [29] "max"              "mean"             "median"           "melt"
-      [33] "min"              "n_chunks"         "null_count"       "pivot"
-      [37] "print"            "quantile"         "rechunk"          "rename"
-      [41] "reverse"          "rolling"          "sample"           "schema"
-      [45] "select"           "shape"            "shift"            "shift_and_fill"
-      [49] "slice"            "sort"             "std"              "sum"
-      [53] "tail"             "to_data_frame"    "to_list"          "to_series"
-      [57] "to_struct"        "transpose"        "unique"           "unnest"
-      [61] "var"              "width"            "with_columns"     "with_row_count"
-      [65] "with_row_index"   "write_csv"        "write_json"       "write_ndjson"
-      [69] "write_parquet"
+      [33] "min"              "n_chunks"         "null_count"       "partition_by"
+      [37] "pivot"            "print"            "quantile"         "rechunk"
+      [41] "rename"           "reverse"          "rolling"          "sample"
+      [45] "schema"           "select"           "shape"            "shift"
+      [49] "shift_and_fill"   "slice"            "sort"             "std"
+      [53] "sum"              "tail"             "to_data_frame"    "to_list"
+      [57] "to_series"        "to_struct"        "transpose"        "unique"
+      [61] "unnest"           "var"              "width"            "with_columns"
+      [65] "with_row_count"   "with_row_index"   "write_csv"        "write_json"
+      [69] "write_ndjson"     "write_parquet"
 
 ---
 
@@ -104,18 +104,19 @@
       [13] "get_columns"            "lazy"
       [15] "melt"                   "n_chunks"
      [17] "new_with_capacity"      "null_count"
-      [19] "pivot_expr"             "print"
-      [21] "rechunk"                "sample_frac"
-      [23] "sample_n"               "schema"
-      [25] "select"                 "select_at_idx"
-      [27] "set_column_from_robj"   "set_column_from_series"
-      [29] "set_column_names_mut"   "shape"
-      [31] "to_list"                "to_list_tag_structs"
-      [33] "to_list_unwind"         "to_struct"
-      [35] "transpose"              "unnest"
-      [37] "with_columns"           "with_row_index"
-      [39] "write_csv"              "write_json"
-      [41] "write_ndjson"           "write_parquet"
+      [19] "partition_by"           "pivot_expr"
+      [21] "print"                  "rechunk"
+      [23] "sample_frac"            "sample_n"
+      [25] "schema"                 "select"
+      [27] "select_at_idx"          "set_column_from_robj"
+      [29] "set_column_from_series" "set_column_names_mut"
+      [31] "shape"                  "to_list"
+      [33] "to_list_tag_structs"    "to_list_unwind"
+      [35] "to_struct"              "transpose"
+      [37] "unnest"                 "with_columns"
+      [39] "with_row_index"         "write_csv"
+      [41] "write_json"             "write_ndjson"
+      [43] "write_parquet"
 
 # public and private methods of each class GroupBy
 
diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R
index c599f6d2a..f15c7f3b2 100644
--- a/tests/testthat/test-dataframe.R
+++ b/tests/testthat/test-dataframe.R
@@ -1324,3 +1324,84 @@ test_that("flags work", {
     )
   )
 })
+
+test_that("partition_by", {
+  df = pl$DataFrame(
+    col1 = 1:5,
+    col2 = c("a", "a", "b", "b", "b"),
+    col3 = c(rep_len("c", 3), rep_len("d", 2))
+  )
+
+  # Test `maintain_order = TRUE`
+  expect_true(
+    df$equals(pl$concat(df$partition_by("col2")))
+  )
+  expect_true(
+    df$equals(pl$concat(df$partition_by("col2", "col3")))
+  )
+  expect_true(
+    df$drop("col3")$equals(pl$concat(df$partition_by("col3", include_key = FALSE)))
+  )
+
+  # Test `maintain_order = FALSE`
+  df_sorted = df$sort(pl$all())
+  expect_true(
+    df_sorted$equals(pl$concat(df$partition_by("col2", maintain_order = FALSE))$sort(pl$all()))
+  )
+  expect_true(
+    df_sorted$equals(pl$concat(df$partition_by("col2", "col3", maintain_order = FALSE))$sort(pl$all()))
+  )
+  expect_true(
+    df$drop("col3")$sort(pl$all())$equals(
+      pl$concat(df$partition_by("col3", include_key = FALSE, maintain_order = FALSE))$sort(pl$all())
+    )
+  )
+
+  # Test selecting columns by data type
+  expect_true(
+    mapply(
+      df$partition_by("col2", "col3"),
+      df$partition_by(pl$String),
+      FUN = \(x, y) x$equals(y)
+    ) |>
+      all()
+  )
+
+  # Test errors
+  expect_error(df$partition_by("foo"), "not found: foo")
+  expect_error(df$partition_by(pl$Int8), "There is no column to partition by")
+
+  # Test `as_nested_list = TRUE`
+  expect_true(
+    mapply(
+      df$partition_by("col2", "col3"),
+      df$partition_by("col2", "col3", as_nested_list = TRUE),
+      FUN = \(x, y) x$equals(y$data)
+    ) |>
+      all()
+  )
+  expect_true(
+    mapply(
+      df$partition_by("col2", "col3", include_key = FALSE),
+      df$partition_by("col2", "col3", as_nested_list = TRUE, include_key = FALSE),
+      FUN = \(x, y) x$equals(y$data)
+    ) |>
+      all()
+  )
+  expect_true(
+    df$partition_by("col2", "col3", as_nested_list = TRUE, include_key = FALSE) |>
+      lapply(\(x) x$data$with_columns(col2 = pl$lit(x$key$col2), col3 = pl$lit(x$key$col3))) |>
+      pl$concat() |>
+      df$equals()
+  )
+  expect_true(
+    df$partition_by("col2", "col3", as_nested_list = TRUE, maintain_order = FALSE) |>
+      lapply(\(x) x$data) |>
+      pl$concat() |>
+      (\(x) df$equals(x$sort(pl$all())))()
+  )
+
+  expect_warning(
+    df$partition_by("col2", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE)
+  )
+})
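A common use of the new method is fanning a DataFrame out into one file per group, which is where `as_nested_list = TRUE` pays off: each group's key is available for naming the output. The sketch below is illustrative only — the data and the `sales_<region>.csv` naming scheme are made up; it assumes a polars build with this patch applied, and uses only methods confirmed by the patch (`$partition_by()`, `$write_csv()`):

```r
library(polars)

df = pl$DataFrame(
  region = c("east", "east", "west", "west"),
  sales = c(100, 150, 80, 120)
)

# `as_nested_list = TRUE` returns list(key = <named list>, data = <DataFrame>)
# per group, so the key values can drive the output file names.
df$partition_by("region", as_nested_list = TRUE) |>
  lapply(\(x) {
    path = paste0("sales_", x$key$region, ".csv") # hypothetical naming scheme
    x$data$write_csv(path)
  }) |>
  invisible()
```

Because `include_key` defaults to `TRUE`, each written CSV still contains the `region` column; pass `include_key = FALSE` to drop it while keeping the key available via `x$key` (the `maintain_order = TRUE` default makes that combination valid).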