Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix!: POSIXct without timezone and naive time handling #878

Merged
merged 17 commits into from
Mar 2, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,77 @@
(either lexical or physical). This also means that calling `pl$Categorical`
doesn't create a `DataType` anymore. All calls to `pl$Categorical` must be
replaced by `pl$Categorical()` (#860).
- The conversion strategy between the POSIXct type without time zone attribute
and Polars datetime has been changed (#878).
`POSIXct` class vectors without a time zone attribute have UTC time internally
and is displayed based on the system's time zone. Previous versions of `polars`
only considered the internal value and interpreted it as UTC time, so the
time displayed as `POSIXct` and in Polars was different.

```r
# polars 0.14.1
Sys.setenv(TZ = "Europe/Paris")
datetime = as.POSIXct("1900-01-01")
datetime
#> [1] "1900-01-01 PMT"

s = polars::as_polars_series(datetime)
s
#> polars Series: shape: (1,)
#> Series: '' [datetime[ms]]
#> [
#> 1899-12-31 23:50:39
#> ]

as.vector(s)
#> [1] "1900-01-01 PMT"
```

Now the internal value is updated to match the displayed value.

```r
# polars 0.15.0
Sys.setenv(TZ = "Europe/Paris")
datetime = as.POSIXct("1900-01-01")
datetime
#> [1] "1900-01-01 PMT"

s = polars::as_polars_series(datetime)
s
#> polars Series: shape: (1,)
#> Series: '' [datetime[ms]]
#> [
#> 1900-01-01 00:00:00
#> ]

as.vector(s)
#> [1] "1900-01-01 PMT"
```

This update may cause errors when converting from Polars to `POSIXct` for non-existent
or ambiguous times. It is recommended to explicitly add a time zone before converting
from Polars to R.

```r
Sys.setenv(TZ = "America/New_York")
ambiguous_time = as.POSIXct("2020-11-01 01:00:00")
ambiguous_time
#> [1] "2020-11-01 01:00:00 EDT"

pls = polars::as_polars_series(ambiguous_time)
pls
#> polars Series: shape: (1,)
#> Series: '' [datetime[ms]]
#> [
#> 2020-11-01 01:00:00
#> ]

## This will be error!
# pls |> as.vector()

pls$dt$replace_time_zone("UTC") |> as.vector()
#> [1] "2020-11-01 01:00:00 UTC"
```
etiennebacher marked this conversation as resolved.
Show resolved Hide resolved

### New features

Expand Down
43 changes: 39 additions & 4 deletions src/rust/src/conversion_r_to_s.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::robj_to;
use crate::series::RPolarsSeries;
use crate::utils::collect_hinted_result;
use extendr_api::prelude::*;
Expand All @@ -6,6 +7,8 @@ use extendr_api::prelude::*;
use polars::prelude as pl;
use polars::prelude::IntoSeries;
use polars::prelude::NamedFrom;
use polars_lazy::dsl::col;
use polars_lazy::frame::IntoLazy;
// Internal tree structure to contain Series of fully parsed nested Robject.
// It is easier to resolve concatenated datatype after all elements have been parsed
// because empty lists have no type in R, but the corrosponding polars type must be known before
Expand Down Expand Up @@ -210,10 +213,42 @@ fn recursive_robjname2series_tree(x: &Robj, name: &str) -> pl::PolarsResult<Seri
})
.flatten();
//todo this could probably in fewer allocations
let dt = pl::DataType::Datetime(pl::TimeUnit::Milliseconds, tz);
Ok(SeriesTree::Series(
((s * 1000f64).cast(&pl::DataType::Int64)?).cast(&dt)?,
))
match tz {
// zoned time
Some(tz) => {
Ok(SeriesTree::Series(
(s * 1_000f64).cast(&pl::DataType::Int64)?.cast(
&pl::DataType::Datetime(pl::TimeUnit::Milliseconds, Some(tz)),
)?,
))
}
// sys time
None => {
let sys_tz_robj = R!("Sys.timezone()")
.map_err(|err| pl::PolarsError::ComputeError(err.to_string().into()))?;
let sys_tz = robj_to!(String, sys_tz_robj)
.map_err(|err| pl::PolarsError::ComputeError(err.to_string().into()))?;
let s_name = s.name();
let utc_s = (s.clone() * 1_000f64).cast(&pl::DataType::Int64)?.cast(
&pl::DataType::Datetime(
pl::TimeUnit::Milliseconds,
Some("UTC".to_string()),
),
)?;
Ok(SeriesTree::Series(
pl::DataFrame::new(vec![utc_s.clone()])?
.lazy()
.select([col(s_name)
.dt()
.convert_time_zone(sys_tz)
.dt()
.replace_time_zone(None, pl::lit("raise"))])
.collect()?
.column(s_name)?
.clone(),
))
}
}
}
Ok(SeriesTree::Series(s)) if x.inherits("Date") => {
Ok(SeriesTree::Series(s.cast(&pl::DataType::Date)?))
Expand Down
35 changes: 30 additions & 5 deletions src/rust/src/conversion_s_to_r.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use crate::rdataframe::RPolarsDataFrame;
use crate::{rdataframe::RPolarsDataFrame, robj_to};
use extendr_api::prelude::*;
use pl::PolarsError as pl_error;
use polars::prelude::{self as pl};
use polars_core::datatypes::DataType;
use polars_lazy::{dsl::col, frame::IntoLazy};

//TODO throw a warning if i32 contains a lowerbound value which is the NA in R.
pub fn pl_series_to_list(
Expand Down Expand Up @@ -212,9 +213,31 @@ pub fn pl_series_to_list(
pl::TimeUnit::Milliseconds => 1_000.0,
};

//resolve timezone
let tz = opt_tz.as_ref().map(|s| s.as_str()).unwrap_or("");
s.cast(&Float64)?
let zoned_s: pl::Series = match opt_tz {
Some(_tz) => {
// zoned time
s.clone()
}
None => {
// naive time
let sys_tz_robj = R!("Sys.timezone()")
.map_err(|err| pl::PolarsError::ComputeError(err.to_string().into()))?;
let sys_tz = robj_to!(String, sys_tz_robj)
.map_err(|err| pl::PolarsError::ComputeError(err.to_string().into()))?;
let s_name = s.name();
pl::DataFrame::new(vec![s.clone()])?
.lazy()
.select([col(s_name)
.dt()
.replace_time_zone(Some(sys_tz), pl::lit("raise"))])
.collect()?
.column(s_name)?
.clone()
}
};

zoned_s
.cast(&Float64)?
.f64()
.map(|ca| {
ca.into_iter()
Expand All @@ -226,7 +249,9 @@ pub fn pl_series_to_list(
robj.set_class(&["POSIXct", "POSIXt"])
.expect("internal error: class POSIXct label failed")
})
.map(|mut robj| robj.set_attrib("tzone", tz))
.map(|mut robj| {
robj.set_attrib("tzone", opt_tz.as_ref().map(|s| s.as_str()).unwrap_or(""))
})
.expect("internal error: attr tzone failed")
.map_err(|err| {
pl_error::ComputeError(
Expand Down
14 changes: 14 additions & 0 deletions tests/testthat/test-as_polars.R
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,20 @@ patrick::with_parameters_test_that("clock package class support",
as.POSIXct(as.vector(pl_sys_time), tz = "Asia/Kolkata"),
as.POSIXct(clock_sys_time, tz = "Asia/Kolkata")
)

# Test on other time zone
withr::with_envvar(
new = c(TZ = "Europe/Paris"),
{
expect_equal(as.POSIXct(as.vector(pl_naive_time)), as.POSIXct(clock_naive_time))
expect_equal(as.POSIXct(as.vector(pl_zoned_time_1)), as.POSIXct(clock_zoned_time_1))
expect_equal(as.POSIXct(as.vector(pl_sys_time)), as.POSIXct(clock_sys_time, tz = "UTC"))
expect_equal(
as.POSIXct(as.vector(pl_sys_time), tz = "Asia/Kolkata"),
as.POSIXct(clock_sys_time, tz = "Asia/Kolkata")
)
}
)
},
precision = c("nanosecond", "microsecond", "millisecond", "second", "minute", "hour", "day"),
.test_name = precision
Expand Down
20 changes: 20 additions & 0 deletions tests/testthat/test-datatype.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,26 @@ test_that("POSIXct data conversion", {
as.POSIXct("2022-01-01")
)

withr::with_envvar(
new = c(TZ = "America/New_York"),
{
non_exsitent_time_chr = "2020-03-08 02:00:00"
ambiguous_time_chr = "2020-11-01 01:00:00"
expect_identical(
pl$lit(as.POSIXct(non_exsitent_time_chr))$to_r(),
as.POSIXct(non_exsitent_time_chr)
)
expect_error(
pl$lit(non_exsitent_time_chr)$str$strptime(pl$Datetime(), "%F %T")$to_r(),
"non-existent"
)
expect_error(
pl$lit(ambiguous_time_chr)$str$strptime(pl$Datetime(), "%F %T")$to_r(),
"ambiguous"
)
}
)

expect_identical(
pl$lit(as.POSIXct("2022-01-01", tz = "GMT"))$to_r(),
as.POSIXct("2022-01-01", tz = "GMT")
Expand Down
49 changes: 22 additions & 27 deletions tests/testthat/test-expr_datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@ test_that("pl$date_range", {
)
expect_identical(
pl$date_range(start = t1, end = t2, interval = "6h", time_zone = "GMT")$to_r(),
seq(t1, t2, by = as.difftime(6, units = "hours")) |> "attr<-"("tzone", "GMT")
seq(
as.POSIXct("2022-01-01", tz = "GMT"),
as.POSIXct("2022-01-02", tz = "GMT"),
by = as.difftime(6, units = "hours")
)
)
expect_identical(
pl$date_range(start = t1, end = t2, interval = "3h", time_unit = "ms")$to_r(),
Expand Down Expand Up @@ -134,32 +138,23 @@ test_that("pl$date_range lazy ", {


test_that("pl$date_range Date lazy/eager", {
r_vers = paste(unlist(R.version[c("major", "minor")]), collapse = ".")
if (r_vers >= "4.3.0") {
d1 = as.Date("2022-01-01")
s_d = pl$Series(d1, name = "Date")
s_dt = pl$Series(as.POSIXct(d1), name = "Date") # since R4.3 this becomes UTC timezone
df = pl$DataFrame(Date = d1)$to_series()
dr_e = pl$date_range(d1, d1 + 1, interval = "6h")
dr_l = pl$date_range(d1, d1 + 1, interval = "6h", eager = FALSE)
expect_identical(as.POSIXct(s_d$to_r()) |> "attr<-"("tzone", "UTC"), s_dt$to_r())
expect_identical(d1, s_d$to_r())
expect_identical(d1, df$to_r())
expect_identical(s_dt$to_r(), dr_e$to_r()[1] |> "attr<-"("tzone", "UTC"))
expect_identical(s_dt$to_r(), dr_l$to_r()[1] |> "attr<-"("tzone", "UTC"))
} else {
d1 = as.Date("2022-01-01")
s_d = pl$Series(d1, name = "Date")
s_dt = pl$Series(as.POSIXct(d1), name = "Date")
df = pl$DataFrame(Date = d1)$to_series()
dr_e = pl$date_range(d1, d1 + 1, interval = "6h")
dr_l = pl$date_range(d1, d1 + 1, interval = "6h", eager = FALSE)
expect_identical(as.POSIXct(s_d$to_r()) |> "attr<-"("tzone", ""), s_dt$to_r())
expect_identical(d1, s_d$to_r())
expect_identical(d1, df$to_r())
expect_identical(s_dt$to_r(), dr_e$to_r()[1])
expect_identical(s_dt$to_r(), dr_l$to_r()[1])
}
d_chr = "2022-01-01"
d_plus1_chr = "2022-01-02"
d_date = as.Date(d_chr)
s_d = pl$Series(d_date)
s_dt = pl$Series(as.POSIXct(d_chr))
df = pl$DataFrame(Date = d_date)$to_series()

dr_e = pl$date_range(d_date, d_date + 1, interval = "6h", eager = TRUE)
dr_l = pl$date_range(d_date, d_date + 1, interval = "6h", eager = FALSE)

expect_identical(dr_e$to_r()[1], s_dt$to_r())
expect_identical(rev(dr_e$to_r())[1], as.POSIXct(d_plus1_chr))
expect_identical(dr_e$len(), 5)

expect_identical(dr_l$to_r()[1], s_dt$to_r())
expect_identical(rev(dr_l$to_r())[1], as.POSIXct(d_plus1_chr))
expect_identical(dr_l$to_series()$len(), 5)
})


Expand Down
Loading