fix: fix R side
eitsupi committed Nov 17, 2024
1 parent f0a6b6f commit f8e173a
Showing 9 changed files with 52 additions and 29 deletions.
5 changes: 2 additions & 3 deletions R/expr__expr.R
@@ -1758,8 +1758,7 @@ Expr_n_unique = use_extendr_wrapper
 #' This is done using the HyperLogLog++ algorithm for cardinality estimation.
 #' @return Expr
 #' @examples
-#' as_polars_df(iris[, 4:5])$
-#'   with_columns(count = pl$col("Species")$approx_n_unique())
+#' as_polars_df(mtcars)$select(count = pl$col("cyl")$approx_n_unique())
 Expr_approx_n_unique = use_extendr_wrapper

 #' Count missing values
@@ -3039,7 +3038,7 @@ Expr_arctanh = use_extendr_wrapper
 #' # One can specify more than 2 dimensions by using the Array type
 #' df = pl$DataFrame(foo = 1:12)
 #' df$select(
-#'   pl$col("foo")$reshape(c(3, 2, 2), nested_type = pl$Array(pl$Float32, 2))
+#'   pl$col("foo")$reshape(c(3, 2, 2))
 #' )
 Expr_reshape = function(dimensions) {
   .pr$Expr$reshape(self, dimensions) |>
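
For readers tracking this API change: `$reshape()` no longer takes a `nested_type` argument, and (per the updated test further down) reshaping to more than one dimension now yields an `Array` dtype directly instead of a `List`. A minimal sketch of the new call, assuming the package is built from this commit:

library(polars)

df = pl$DataFrame(foo = 1:12)

# No `nested_type =` argument anymore: the nested Array dtype is implied
# by the dimensions, and -1 infers that dimension from the column length.
out = df$select(pl$col("foo")$reshape(c(3, 2, 2)))

# Per the updated test, a reshape like c(-1, 2) now reports
# Array(Int32, 2) rather than List(Int32).
print(out$schema)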
4 changes: 2 additions & 2 deletions R/extendr-wrappers.R
@@ -98,7 +98,7 @@ import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index,

new_from_ndjson <- function(path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors) .Call(wrap__new_from_ndjson, path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors)

-new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, hive_schema, try_parse_hive_dates, glob, include_file_paths) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, hive_schema, try_parse_hive_dates, glob, include_file_paths)
+new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, schema, hive_schema, try_parse_hive_dates, glob, include_file_paths, allow_missing_columns) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_index, storage_options, use_statistics, low_memory, hive_partitioning, schema, hive_schema, try_parse_hive_dates, glob, include_file_paths, allow_missing_columns)

test_rpolarserr <- function() .Call(wrap__test_rpolarserr)

@@ -680,7 +680,7 @@ RPolarsExpr$arccosh <- function() .Call(wrap__RPolarsExpr__arccosh, self)

RPolarsExpr$arctanh <- function() .Call(wrap__RPolarsExpr__arctanh, self)

-RPolarsExpr$reshape <- function(dimensions, is_list) .Call(wrap__RPolarsExpr__reshape, self, dimensions, is_list)
+RPolarsExpr$reshape <- function(dimensions) .Call(wrap__RPolarsExpr__reshape, self, dimensions)

RPolarsExpr$shuffle <- function(seed) .Call(wrap__RPolarsExpr__shuffle, self, seed)

18 changes: 15 additions & 3 deletions R/io_parquet.R
@@ -12,11 +12,17 @@
 #' @param rechunk In case of reading multiple files via a glob pattern, rechunk
 #'   the final DataFrame into contiguous memory chunks.
 #' @param glob Expand path given via globbing rules.
+#' @param schema Specify the datatypes of the columns. The datatypes must match the datatypes in the file(s).
+#'   If there are extra columns that are not in the file(s), consider also enabling `allow_missing_columns`.
 #' @param use_statistics Use statistics in the parquet file to determine if pages
 #'   can be skipped from reading.
 #' @param storage_options Experimental. List of options necessary to scan
 #'   parquet files from different cloud storage providers (GCP, AWS, Azure,
 #'   HuggingFace). See the 'Details' section.
+#' @param allow_missing_columns When reading a list of parquet files, if a column existing in the first
+#'   file cannot be found in subsequent files, the default behavior is to raise an error.
+#'   However, if `allow_missing_columns` is set to `TRUE`, a full-NULL column is returned
+#'   instead of erroring for the files that do not contain the column.
 #'
 #' @rdname IO_scan_parquet
 #' @details
@@ -101,12 +107,14 @@ pl_scan_parquet = function(
     hive_schema = NULL,
     try_parse_hive_dates = TRUE,
     glob = TRUE,
+    schema = NULL,
     rechunk = FALSE,
     low_memory = FALSE,
     storage_options = NULL,
     use_statistics = TRUE,
     cache = TRUE,
-    include_file_paths = NULL) {
+    include_file_paths = NULL,
+    allow_missing_columns = FALSE) {
   new_from_parquet(
     path = source,
     n_rows = n_rows,
@@ -122,7 +130,9 @@
     try_parse_hive_dates = try_parse_hive_dates,
     storage_options = storage_options,
     glob = glob,
-    include_file_paths = include_file_paths
+    schema = schema,
+    include_file_paths = include_file_paths,
+    allow_missing_columns = allow_missing_columns
   ) |>
     unwrap("in pl$scan_parquet():")
 }
@@ -162,12 +172,14 @@ pl_read_parquet = function(
     hive_schema = NULL,
     try_parse_hive_dates = TRUE,
     glob = TRUE,
+    schema = NULL,
     rechunk = TRUE,
     low_memory = FALSE,
     storage_options = NULL,
     use_statistics = TRUE,
     cache = TRUE,
-    include_file_paths = NULL) {
+    include_file_paths = NULL,
+    allow_missing_columns = FALSE) {
   .args = as.list(environment())
   result({
     do.call(pl$scan_parquet, .args)$collect()
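
To make the two new options concrete, here is a hedged usage sketch. The file and column names are hypothetical, `schema` is assumed to accept a named list of dtypes (mirroring how `hive_schema` is passed), and a character vector of paths is assumed to be a valid source. Note from the hunk above that `pl$read_parquet()` simply forwards all of its arguments, including the new ones, to `pl$scan_parquet()` via `do.call()` and collects the result.

library(polars)

# Hypothetical layout: "a.parquet" has columns x and y,
# while "b.parquet" contains only x.
df = pl$read_parquet(
  c("a.parquet", "b.parquet"),
  # The declared dtypes must match what is physically in the files.
  schema = list(x = pl$Int64, y = pl$String),
  # Default FALSE: a column present in the first file but missing from a
  # later one raises an error. TRUE fills it in as a full-null column.
  allow_missing_columns = TRUE
)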
3 changes: 1 addition & 2 deletions man/Expr_approx_n_unique.Rd

Some generated files are not rendered by default.

8 changes: 2 additions & 6 deletions man/Expr_reshape.Rd

Some generated files are not rendered by default.

12 changes: 11 additions & 1 deletion man/IO_read_parquet.Rd

Some generated files are not rendered by default.

12 changes: 11 additions & 1 deletion man/IO_scan_parquet.Rd

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion tests/testthat/test-as_polars.R
@@ -485,7 +485,7 @@ test_that("automatically rechunked for struct array stream from C stream interface
     as_polars_series()

   expect_identical(s_int_exp$n_chunks(), 2)
-  expect_identical(s_struct_exp$n_chunks(), 1)
+  expect_identical(s_struct_exp$n_chunks(), 2)
   expect_identical(s_struct_stable$n_chunks(), 2)
 })

17 changes: 7 additions & 10 deletions tests/testthat/test-expr_expr.R
@@ -1708,18 +1708,15 @@ test_that("Expr_rolling_*_by", {
     )$to_data_frame(),
     expected
   )
-})
-
-test_that("Expr_rolling_*_by only works with date/datetime", {
-  df = pl$DataFrame(a = 1:6, id = 11:16)
-
-  expect_error(
-    df$select(pl$col("a")$rolling_min_by("id", window_size = "2i")),
-    "`by` argument of dtype `i32` is not supported"
+  expect_no_error(
+    pl$DataFrame(a = 1:6, id = 11:16)$select(pl$col("a")$rolling_min_by("id", window_size = "2i")),
   )
 })

 test_that("Expr_rolling_*_by error", {
   expect_error(
-    df$select(pl$col("a")$rolling_min_by(1, window_size = "2d")),
+    pl$DataFrame(a = 1:6, id = 11:16)$select(pl$col("a")$rolling_min_by(1, window_size = "2d")),
     "must be the same length as values column"
   )
 })
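
The flip from `expect_error()` to `expect_no_error()` above encodes a behavior change: an integer `by` column combined with an index-based window such as "2i" is no longer rejected. A short sketch of the now-valid call, assuming a build that includes this commit:

library(polars)

df = pl$DataFrame(a = 1:6, id = 11:16)

# `id` is an Int32 column and "2i" means a window of two index steps.
# This previously failed with:
# "`by` argument of dtype `i32` is not supported"
df$select(pl$col("a")$rolling_min_by("id", window_size = "2i"))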
@@ -2187,12 +2184,12 @@ test_that("reshape", {
   expect_true(
     pl$DataFrame(a = 1:4)$select(
       pl$col("a")$reshape(c(-1, 2))
-    )$dtypes[[1]] == pl$List(pl$Int32)
+    )$dtypes[[1]] == pl$Array(pl$Int32, 2)
   )

   # One can specify more than 2 dimensions by using the Array type
   out = pl$DataFrame(foo = 1:12)$select(
-    pl$col("foo")$reshape(c(3, 2, 2), nested_type = pl$Array(pl$Float32, 2))
+    pl$col("foo")$reshape(c(3, 2, 2))
   )
   # annoying to test schema equivalency with list()
   expect_snapshot(out$schema)
