fix: fix R side
eitsupi committed Nov 17, 2024
1 parent f0a6b6f commit f8e173a
Showing 9 changed files with 52 additions and 29 deletions.
5 changes: 2 additions & 3 deletions R/expr__expr.R
@@ -1758,8 +1758,7 @@ Expr_n_unique = use_extendr_wrapper
 #' This is done using the HyperLogLog++ algorithm for cardinality estimation.
 #' @return Expr
 #' @examples
-#' as_polars_df(iris[, 4:5])$
-#'   with_columns(count = pl$col("Species")$approx_n_unique())
+#' as_polars_df(mtcars)$select(count = pl$col("cyl")$approx_n_unique())
 Expr_approx_n_unique = use_extendr_wrapper

 #' Count missing values
@@ -3039,7 +3038,7 @@ Expr_arctanh = use_extendr_wrapper
 #' # One can specify more than 2 dimensions by using the Array type
 #' df = pl$DataFrame(foo = 1:12)
 #' df$select(
-#'   pl$col("foo")$reshape(c(3, 2, 2), nested_type = pl$Array(pl$Float32, 2))
+#'   pl$col("foo")$reshape(c(3, 2, 2))
 #' )
 Expr_reshape = function(dimensions) {
   .pr$Expr$reshape(self, dimensions) |>
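
For readers tracking this API change: `$reshape()` no longer takes a `nested_type` argument, and (per the updated test further down) reshaping to more than one dimension now yields an `Array` dtype directly instead of a `List`. A minimal sketch of the new call, assuming the package is built from this commit:

library(polars)

df = pl$DataFrame(foo = 1:12)

# No `nested_type =` argument anymore: the nested Array dtype is implied
# by the dimensions, and -1 infers that dimension from the column length.
out = df$select(pl$col("foo")$reshape(c(3, 2, 2)))

# Per the updated test, a reshape like c(-1, 2) now reports
# Array(Int32, 2) rather than List(Int32).
print(out$schema)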
4 changes: 2 additions & 2 deletions R/extendr-wrappers.R
@@ -98,7 +98,7 @@ import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index,

new_from_ndjson <- function(path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors) .Call(wrap__new_from_ndjson, path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors)

-new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, hive_schema, try_parse_hive_dates, glob, include_file_paths) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, hive_schema, try_parse_hive_dates, glob, include_file_paths)
+new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, schema, hive_schema, try_parse_hive_dates, glob, include_file_paths, allow_missing_columns) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, schema, hive_schema, try_parse_hive_dates, glob, include_file_paths, allow_missing_columns)

test_rpolarserr <- function() .Call(wrap__test_rpolarserr)

@@ -680,7 +680,7 @@ RPolarsExpr$arccosh <- function() .Call(wrap__RPolarsExpr__arccosh, self)

RPolarsExpr$arctanh <- function() .Call(wrap__RPolarsExpr__arctanh, self)

-RPolarsExpr$reshape <- function(dimensions, is_list) .Call(wrap__RPolarsExpr__reshape, self, dimensions, is_list)
+RPolarsExpr$reshape <- function(dimensions) .Call(wrap__RPolarsExpr__reshape, self, dimensions)

RPolarsExpr$shuffle <- function(seed) .Call(wrap__RPolarsExpr__shuffle, self, seed)

18 changes: 15 additions & 3 deletions R/io_parquet.R
@@ -12,11 +12,17 @@
 #' @param rechunk In case of reading multiple files via a glob pattern, rechunk
 #'   the final DataFrame into contiguous memory chunks.
 #' @param glob Expand path given via globbing rules.
+#' @param schema Specify the datatypes of the columns. The datatypes must match the datatypes in the file(s).
+#'   If there are extra columns that are not in the file(s), consider also enabling `allow_missing_columns`.
 #' @param use_statistics Use statistics in the parquet file to determine if pages
 #'   can be skipped from reading.
 #' @param storage_options Experimental. List of options necessary to scan
 #'   parquet files from different cloud storage providers (GCP, AWS, Azure,
 #'   HuggingFace). See the 'Details' section.
+#' @param allow_missing_columns When reading a list of parquet files, if a column existing in the first
+#'   file cannot be found in subsequent files, the default behavior is to raise an error.
+#'   However, if `allow_missing_columns` is set to `TRUE`, a full-NULL column is returned
+#'   instead of erroring for the files that do not contain the column.
 #'
 #' @rdname IO_scan_parquet
 #' @details
@@ -101,12 +107,14 @@ pl_scan_parquet = function(
     hive_schema = NULL,
     try_parse_hive_dates = TRUE,
     glob = TRUE,
+    schema = NULL,
     rechunk = FALSE,
     low_memory = FALSE,
     storage_options = NULL,
     use_statistics = TRUE,
     cache = TRUE,
-    include_file_paths = NULL) {
+    include_file_paths = NULL,
+    allow_missing_columns = FALSE) {
   new_from_parquet(
     path = source,
     n_rows = n_rows,
@@ -122,7 +130,9 @@
     try_parse_hive_dates = try_parse_hive_dates,
     storage_options = storage_options,
     glob = glob,
-    include_file_paths = include_file_paths
+    schema = schema,
+    include_file_paths = include_file_paths,
+    allow_missing_columns = allow_missing_columns
   ) |>
     unwrap("in pl$scan_parquet():")
 }
@@ -162,12 +172,14 @@ pl_read_parquet = function(
     hive_schema = NULL,
     try_parse_hive_dates = TRUE,
     glob = TRUE,
+    schema = NULL,
     rechunk = TRUE,
     low_memory = FALSE,
     storage_options = NULL,
     use_statistics = TRUE,
     cache = TRUE,
-    include_file_paths = NULL) {
+    include_file_paths = NULL,
+    allow_missing_columns = FALSE) {
   .args = as.list(environment())
   result({
     do.call(pl$scan_parquet, .args)$collect()
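
To make the two new options concrete, here is a hedged usage sketch. The file and column names are hypothetical, `schema` is assumed to accept a named list of dtypes (mirroring how `hive_schema` is passed), and a character vector of paths is assumed to be a valid source. Note from the hunk above that `pl$read_parquet()` simply forwards all of its arguments, including the new ones, to `pl$scan_parquet()` via `do.call()` and collects the result.

library(polars)

# Hypothetical layout: "a.parquet" has columns x and y,
# while "b.parquet" contains only x.
df = pl$read_parquet(
  c("a.parquet", "b.parquet"),
  # The declared dtypes must match what is physically in the files.
  schema = list(x = pl$Int64, y = pl$String),
  # Default FALSE: a column present in the first file but missing from a
  # later one raises an error. TRUE fills it in as a full-null column.
  allow_missing_columns = TRUE
)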
3 changes: 1 addition & 2 deletions man/Expr_approx_n_unique.Rd

Some generated files are not rendered by default.

8 changes: 2 additions & 6 deletions man/Expr_reshape.Rd

Some generated files are not rendered by default.

12 changes: 11 additions & 1 deletion man/IO_read_parquet.Rd

Some generated files are not rendered by default.

12 changes: 11 additions & 1 deletion man/IO_scan_parquet.Rd

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion tests/testthat/test-as_polars.R
@@ -485,7 +485,7 @@ test_that("automatically rechunked for struct array stream from C stream interface
     as_polars_series()

   expect_identical(s_int_exp$n_chunks(), 2)
-  expect_identical(s_struct_exp$n_chunks(), 1)
+  expect_identical(s_struct_exp$n_chunks(), 2)
   expect_identical(s_struct_stable$n_chunks(), 2)
 })

17 changes: 7 additions & 10 deletions tests/testthat/test-expr_expr.R
@@ -1708,18 +1708,15 @@ test_that("Expr_rolling_*_by", {
     )$to_data_frame(),
     expected
   )
-})
-
-test_that("Expr_rolling_*_by only works with date/datetime", {
-  df = pl$DataFrame(a = 1:6, id = 11:16)
-
-  expect_error(
-    df$select(pl$col("a")$rolling_min_by("id", window_size = "2i")),
-    "`by` argument of dtype `i32` is not supported"
+  expect_no_error(
+    pl$DataFrame(a = 1:6, id = 11:16)$select(pl$col("a")$rolling_min_by("id", window_size = "2i")),
   )
 })

 test_that("Expr_rolling_*_by error", {
   expect_error(
-    df$select(pl$col("a")$rolling_min_by(1, window_size = "2d")),
+    pl$DataFrame(a = 1:6, id = 11:16)$select(pl$col("a")$rolling_min_by(1, window_size = "2d")),
     "must be the same length as values column"
   )
 })
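
The flip from `expect_error()` to `expect_no_error()` above encodes a behavior change: an integer `by` column combined with an index-based window such as "2i" is no longer rejected. A short sketch of the now-valid call, assuming a build that includes this commit:

library(polars)

df = pl$DataFrame(a = 1:6, id = 11:16)

# `id` is an Int32 column and "2i" means a window of two index steps.
# This previously failed with:
# "`by` argument of dtype `i32` is not supported"
df$select(pl$col("a")$rolling_min_by("id", window_size = "2i"))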
@@ -2187,12 +2184,12 @@ test_that("reshape", {
   expect_true(
     pl$DataFrame(a = 1:4)$select(
       pl$col("a")$reshape(c(-1, 2))
-    )$dtypes[[1]] == pl$List(pl$Int32)
+    )$dtypes[[1]] == pl$Array(pl$Int32, 2)
   )

   # One can specify more than 2 dimensions by using the Array type
   out = pl$DataFrame(foo = 1:12)$select(
-    pl$col("foo")$reshape(c(3, 2, 2), nested_type = pl$Array(pl$Float32, 2))
+    pl$col("foo")$reshape(c(3, 2, 2))
   )
   # annoying to test schema equivalency with list()
   expect_snapshot(out$schema)
