Skip to content

Commit

Permalink
Merge branch 'main' into polars-0.38.2
Browse files Browse the repository at this point in the history
  • Loading branch information
etiennebacher authored Mar 11, 2024
2 parents 49caae7 + ee6b7f9 commit 0af2cf7
Show file tree
Hide file tree
Showing 13 changed files with 413 additions and 25 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@
^\.editorconfig$
^rustfmt\.toml$
^\.lintr\.R$
^\.mega-linter\.yml$
2 changes: 2 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ root = true
[*]
insert_final_newline = true
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true

[*.md]
indent_style = space
Expand Down
79 changes: 79 additions & 0 deletions .github/workflows/mega-linter.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
---
# MegaLinter GitHub Action configuration file
# More info at https://megalinter.io
name: MegaLinter

on:
pull_request:
branches:
- main
workflow_dispatch:

concurrency:
group: ${{ github.ref }}-${{ github.workflow }}
cancel-in-progress: true

jobs:
megalinter:
name: MegaLinter
runs-on: ubuntu-latest
permissions:
contents: read
steps:
- name: Checkout Code
uses: actions/checkout@v4
with:
token: ${{ secrets.GITHUB_TOKEN }}
fetch-depth: 0

- name: MegaLinter
id: ml
# You can override MegaLinter flavor used to have faster performances
# More info at https://megalinter.io/flavors/
uses: oxsecurity/megalinter/flavors/cupcake@v7.9.0
env:
# All available variables are described in documentation
# https://megalinter.io/configuration/
VALIDATE_ALL_CODEBASE: true
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# ADD YOUR CUSTOM ENV VARIABLES HERE OR DEFINE THEM IN A FILE .mega-linter.yml AT THE ROOT OF YOUR REPOSITORY
# DISABLE: COPYPASTE,SPELL # Uncomment to disable copy-paste and spell checks

# Upload MegaLinter artifacts
- name: Archive production artifacts
if: success() || failure()
uses: actions/upload-artifact@v4
with:
name: MegaLinter reports
path: |
megalinter-reports
mega-linter.log
# Create pull request if applicable (for now works only on PR from same repository, not from forks)
- name: Create Pull Request with applied fixes
id: cpr
if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'pull_request' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
uses: peter-evans/create-pull-request@v6
with:
token: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
commit-message: "[MegaLinter] Apply linters automatic fixes"
title: "[MegaLinter] Apply linters automatic fixes"
labels: bot
- name: Create PR output
if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'pull_request' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
run: |
echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
# Push new commit if applicable (for now works only on PR from same repository, not from forks)
- name: Prepare commit
if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'commit' && github.ref != 'refs/heads/main' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
run: sudo chown -Rc $UID .git/
- name: Commit and push applied linter fixes
if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'commit' && github.ref != 'refs/heads/main' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
uses: stefanzweifel/git-auto-commit-action@v4
with:
branch: ${{ github.event.pull_request.head.ref || github.head_ref || github.ref }}
commit_message: "[MegaLinter] Apply linters fixes"
commit_user_name: megalinter-bot
commit_user_email: nicolas.vuillamy@ox.security
6 changes: 6 additions & 0 deletions .mega-linter.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
APPLY_FIXES: all
DISABLE:
- RUST
- R
DISABLE_LINTERS:
- SPELL_CSPELL
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- It is now possible to create an empty `DataFrame` with a specific schema
with `pl$DataFrame(schema = my_schema)` (#901).
- New arguments `dtype` and `nan_to_null` for `pl$Series()` (#902).
- New method `<DataFrame>$partition_by()` (#898).

### Bug fixes

Expand Down
107 changes: 107 additions & 0 deletions R/dataframe__frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -884,6 +884,8 @@ DataFrame_filter = function(...) {
#' @details Within each group, the order of the rows is always preserved,
#' regardless of the `maintain_order` argument.
#' @return [GroupBy][GroupBy_class] (a DataFrame with special groupby methods like `$agg()`)
#' @seealso
#' - [`<DataFrame>$partition_by()`][DataFrame_partition_by]
#' @examples
#' df = pl$DataFrame(
#' a = c("a", "b", "a", "b", "c"),
Expand Down Expand Up @@ -2093,3 +2095,108 @@ DataFrame_group_by_dynamic = function(
by, start_by, check_sorted
)
}


#' Split a DataFrame into multiple DataFrames
#'
#' Similar to [`$group_by()`][DataFrame_group_by].
#' Group by the given columns and return the groups as separate [DataFrames][DataFrame_class].
#' It is useful to use this in combination with functions like [lapply()] or `purrr::map()`.
#' @param ... Characters of column names to group by. Passed to [`pl$col()`][pl_col].
#' @param maintain_order If `TRUE`, ensure that the order of the groups is consistent with the input data.
#' This is slower than a default partition by operation.
#' @param include_key If `TRUE`, include the columns used to partition the DataFrame in the output.
#' @param as_nested_list This affects the format of the output.
#' If `FALSE` (default), the output is a flat [list] of [DataFrames][DataFrame_class].
#' IF `TRUE` and one of the `maintain_order` or `include_key` argument is `TRUE`,
#' then each element of the output has two children: `key` and `data`.
#' See the examples for more details.
#' @return A list of [DataFrames][DataFrame_class]. See the examples for details.
#' @seealso
#' - [`<DataFrame>$group_by()`][DataFrame_group_by]
#' @examples
#' df = pl$DataFrame(
#' a = c("a", "b", "a", "b", "c"),
#' b = c(1, 2, 1, 3, 3),
#' c = c(5, 4, 3, 2, 1)
#' )
#' df
#'
#' # Pass a single column name to partition by that column.
#' df$partition_by("a")
#'
#' # Partition by multiple columns.
#' df$partition_by("a", "b")
#'
#' # Partition by column data type
#' df$partition_by(pl$String)
#'
#' # If `as_nested_list = TRUE`, the output is a list whose elements have a `key` and a `data` field.
#' # The `key` is a named list of the key values, and the `data` is the DataFrame.
#' df$partition_by("a", "b", as_nested_list = TRUE)
#'
#' # `as_nested_list = TRUE` should be used with `maintain_order = TRUE` or `include_key = TRUE`.
#' tryCatch(
#' df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE),
#' warning = function(w) w
#' )
#'
#' # Example of using with lapply(), and printing the key and the data summary
#' df$partition_by("a", "b", maintain_order = FALSE, as_nested_list = TRUE) |>
#' lapply(\(x) {
#' sprintf("\nThe key value of `a` is %s and the key value of `b` is %s\n", x$key$a, x$key$b) |>
#' cat()
#' x$data$drop(names(x$key))$describe() |>
#' print()
#' invisible(NULL)
#' }) |>
#' invisible()
DataFrame_partition_by = function(
...,
maintain_order = TRUE,
include_key = TRUE,
as_nested_list = FALSE) {
uw = \(res) unwrap(res, "in $partition_by():")

by = result(dots_to_colnames(self, ...)) |>
uw()

if (!length(by)) {
Err_plain("There is no column to partition by.") |>
uw()
}

partitions = .pr$DataFrame$partition_by(self, by, maintain_order, include_key) |>
uw()

if (isTRUE(as_nested_list)) {
if (include_key) {
out = lapply(seq_along(partitions), \(index) {
data = partitions[[index]]
key = data$select(by)$head(1)$to_list()

list(key = key, data = data)
})

return(out)
} else if (maintain_order) {
key_df = self$select(by)$unique(maintain_order = TRUE)
out = lapply(seq_along(partitions), \(index) {
data = partitions[[index]]
key = key_df$slice(index - 1, 1)$to_list()

list(key = key, data = data)
})

return(out)
} else {
warning(
"cannot use `$partition_by` with ",
"`maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE`. ",
"Fall back to a flat list."
)
}
}

partitions
}
16 changes: 13 additions & 3 deletions R/dotdotdot.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ unpack_list = function(..., .context = NULL, .call = sys.call(1L), skip_classes
l = list2(..., .context = .context, .call = .call)
if (
length(l) == 1L &&
is.list(l[[1L]]) &&
!(!is.null(skip_classes) && inherits(l[[1L]], skip_classes)) &&
is.null(names(l))
is.list(l[[1L]]) &&
!(!is.null(skip_classes) && inherits(l[[1L]], skip_classes)) &&
is.null(names(l))
) {
l[[1L]]
} else {
Expand Down Expand Up @@ -79,3 +79,13 @@ unpack_bool_expr_result = function(...) {
}
})
}


#' Convert dots to a character vector of column names
#' @param .df [RPolarsDataFrame]
#' @param ... Arguments to pass to [`pl$col()`][pl_col]
#' @noRd
dots_to_colnames = function(.df, ..., .call = sys.call(1L)) {
result(pl$DataFrame(schema = .df$schema)$select(pl$col(...))$columns) |>
unwrap(call = .call)
}
2 changes: 2 additions & 0 deletions R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ RPolarsDataFrame$to_struct <- function(name) .Call(wrap__RPolarsDataFrame__to_st

RPolarsDataFrame$unnest <- function(names) .Call(wrap__RPolarsDataFrame__unnest, self, names)

RPolarsDataFrame$partition_by <- function(by, maintain_order, include_key) .Call(wrap__RPolarsDataFrame__partition_by, self, by, maintain_order, include_key)

RPolarsDataFrame$export_stream <- function(stream_ptr) invisible(.Call(wrap__RPolarsDataFrame__export_stream, self, stream_ptr))

RPolarsDataFrame$from_arrow_record_batches <- function(rbr) .Call(wrap__RPolarsDataFrame__from_arrow_record_batches, rbr)
Expand Down
5 changes: 5 additions & 0 deletions man/DataFrame_group_by.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

78 changes: 78 additions & 0 deletions man/DataFrame_partition_by.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions src/rust/src/rdataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,21 @@ impl RPolarsDataFrame {
self.lazy().unnest(names)?.collect()
}

pub fn partition_by(&self, by: Robj, maintain_order: Robj, include_key: Robj) -> RResult<List> {
let by = robj_to!(Vec, String, by)?;
let maintain_order = robj_to!(bool, maintain_order)?;
let include_key = robj_to!(bool, include_key)?;
let out = if maintain_order {
self.0.clone().partition_by_stable(by, include_key)
} else {
self.0.partition_by(by, include_key)
}
.map_err(polars_to_rpolars_err)?;

let vec = unsafe { std::mem::transmute::<Vec<pl::DataFrame>, Vec<RPolarsDataFrame>>(out) };
Ok(List::from_values(vec))
}

pub fn export_stream(&self, stream_ptr: &str) {
let schema = self.0.schema().to_arrow(false);
let data_type = ArrowDataType::Struct(schema.fields);
Expand Down
Loading

0 comments on commit 0af2cf7

Please sign in to comment.