diff --git a/DESCRIPTION b/DESCRIPTION
index 17cfef41..be26d8f6 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -35,6 +35,7 @@ Imports:
     cli,
     dplyr,
     httr,
+    interlacer (>= 0.3.2),
     jsonlite,
     purrr,
     readr (>= 2.1.0),
diff --git a/NAMESPACE b/NAMESPACE
index 49c95d69..1c541eaf 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,6 +7,7 @@ export(create_package)
 export(create_schema)
 export(get_schema)
 export(problems)
+export(read_interlaced_resource)
 export(read_package)
 export(read_resource)
 export(remove_resource)
diff --git a/R/frictionless-package.R b/R/frictionless-package.R
index 9b11d6ad..15894a7d 100644
--- a/R/frictionless-package.R
+++ b/R/frictionless-package.R
@@ -2,4 +2,5 @@
 "_PACKAGE"
 
 #' @import rlang
+
 NULL
diff --git a/R/read_resource.R b/R/read_resource.R
index e460b437..13cef96a 100644
--- a/R/read_resource.R
+++ b/R/read_resource.R
@@ -16,6 +16,9 @@
 #' @param col_select Character vector of the columns to include in the result,
 #' in the order provided.
 #' Selecting columns can improve read speed.
+#' @param interlaced Logical value indicating whether columns should be read
+#' as interlaced columns using the interlacer package.
+#' @param ... Arguments passed to `read_resource()`.
 #' @return [tibble()] data frame with the Data Resource's tabular data.
 #' If there are parsing problems, a warning will alert you.
 #' You can retrieve the full details by calling [problems()] on your data
@@ -198,7 +201,8 @@
 #'
 #' # Read data from the resource "deployments" with column selection
 #' read_resource(package, "deployments", col_select = c("latitude", "longitude"))
-read_resource <- function(package, resource_name, col_select = NULL) {
+read_resource <- function(package, resource_name, col_select = NULL,
+                          interlaced = FALSE) {
   # Get resource, includes check_package()
   resource <- get_resource(package, resource_name)
 
@@ -261,77 +265,27 @@ read_resource <- function(package, resource_name, col_select = NULL) {
   )
 
   # Create col_types: list(<collector>, <collector>, ...)
-  col_types <- purrr::map(fields, function(x) {
-    type <- x$type %||% NA_character_
-    enum <- x$constraints$enum
-    group_char <- if (x$groupChar %||% "" != "") TRUE else FALSE
-    bare_number <- if (x$bareNumber %||% "" != FALSE) TRUE else FALSE
-    format <- x$format %||% "default" # Undefined => default
-
-    # Assign types and formats
-    col_type <- switch(type,
-      "string" = if (length(enum) > 0) {
-        readr::col_factor(levels = enum)
-      } else {
-        readr::col_character()
-      },
-      "number" = if (length(enum) > 0) {
-        readr::col_factor(levels = as.character(enum))
-      } else if (group_char) {
-        readr::col_number() # Supports grouping_mark
-      } else if (bare_number) {
-        readr::col_double() # Allows NaN, INF, -INF
-      } else {
-        readr::col_number() # Strips non-num. chars, uses default grouping_mark
-      },
-      "integer" = if (length(enum) > 0) {
-        readr::col_factor(levels = as.character(enum))
-      } else if (bare_number) {
-        readr::col_double() # Not col_integer() to avoid big integers issues
-      } else {
-        readr::col_number() # Strips non-numeric chars
-      },
-      "boolean" = readr::col_logical(),
-      "object" = readr::col_character(),
-      "array" = readr::col_character(),
-      "date" = readr::col_date(format = switch(format,
-        "default" = "%Y-%m-%d", # ISO
-        "any" = "%AD", # YMD
-        "%x" = "%m/%d/%y", # Python strptime for %x
-        format # Default
-      )),
-      "time" = readr::col_time(format = switch(format,
-        "default" = "%AT", # H(MS)
-        "any" = "%AT", # H(MS)
-        "%X" = "%H:%M:%S", # HMS
-        sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
-      )),
-      "datetime" = readr::col_datetime(format = switch(format,
-        "default" = "", # ISO (lenient)
-        "any" = "", # ISO (lenient)
-        sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
-      )),
-      "year" = readr::col_date(format = "%Y"),
-      "yearmonth" = readr::col_date(format = "%Y-%m"),
-      "duration" = readr::col_character(),
-      "geopoint" = readr::col_character(),
-      "geojson" = readr::col_character(),
-      "any" = readr::col_character()
-    )
-    # col_type will be NULL when type is undefined (NA_character_) or an
-    # unrecognized value (e.g. "datum", but will be blocked by check_schema()).
-    # Set those to col_guess().
-    col_type <- col_type %||% readr::col_guess()
-    col_type
-  })
-
-  # Assign names: list("name1" = <collector>, "name2" = <collector>, ...)
-  names(col_types) <- field_names
+  col_types <- interlacer::as.x_col_spec(
+    purrr::map(set_names(fields, field_names), field_to_x_col)
+  )
 
   # Select CSV dialect, see https://specs.frictionlessdata.io/csv-dialect/
   # Note that dialect can be NULL
   dialect <- read_descriptor(resource$dialect, package$directory, safe = TRUE)
 
+  na_col <- missing_values_to_na_col(
+    schema$missingValues %||% "",
+    .name = "default"
+  )
+
+  if (interlaced) {
+    read_fn <- interlacer::read_interlaced_delim
+    na <- na_col
+  } else {
+    read_fn <- readr::read_delim
+    na <- na_col$chr_values
+  }
+
   # Read data directly
   if (resource$read_from == "df") {
     df <- dplyr::as_tibble(resource$data)
@@ -344,7 +298,7 @@ read_resource <- function(package, resource_name, col_select = NULL) {
   } else if (resource$read_from == "path" || resource$read_from == "url") {
     dataframes <- list()
     for (i in seq_along(paths)) {
-      data <- readr::read_delim(
+      data <- read_fn(
         file = paths[i],
         delim = dialect$delimiter %||% ",",
         quote = dialect$quoteChar %||% "\"",
@@ -365,7 +319,7 @@ read_resource <- function(package, resource_name, col_select = NULL) {
         # a column, see https://rlang.r-lib.org/reference/topic-data-mask.html
         col_select = {{col_select}},
         locale = locale,
-        na = schema$missingValues %||% "",
+        na = na,
         comment = dialect$commentChar %||% "",
         trim_ws = dialect$skipInitialSpace %||% FALSE,
         # Skip header row when present
@@ -380,3 +334,170 @@ read_resource <- function(package, resource_name, col_select = NULL) {
 
   return(df)
 }
+
+#' @rdname read_resource
+#' @export
+read_interlaced_resource <- function(...) {
+  read_resource(..., interlaced = TRUE)
+}
+
+# Map a Table Schema field to an interlacer x_col (value spec + missing value spec)
+field_to_x_col <- function(x) {
+  interlacer::x_col(
+    field_to_v_col(x),
+    missing_values_to_na_col(x$missingValues, .name = x$name)
+  )
+}
+
+field_to_v_col <- function(x) {
+  type <- x$type %||% NA_character_
+  categories <- x$categories
+  categoriesOrdered <- x$categoriesOrdered %||% FALSE
+  group_char <- if (x$groupChar %||% "" != "") TRUE else FALSE
+  bare_number <- if (x$bareNumber %||% "" != FALSE) TRUE else FALSE
+  format <- x$format %||% "default" # Undefined => default
+
+  # Assign types and formats
+  col_type <- switch(type,
+    "string" = if (!is.null(categories)) {
+      categories_to_v_col(categories, categoriesOrdered, .name = x$name)
+    } else {
+      readr::col_character()
+    },
+    "number" = if (group_char) {
+      readr::col_number() # Supports grouping_mark
+    } else if (bare_number) {
+      readr::col_double() # Allows NaN, INF, -INF
+    } else {
+      readr::col_number() # Strips non-num. chars, uses default grouping_mark
+    },
+    "integer" = if (!is.null(categories)) {
+      categories_to_v_col(categories, categoriesOrdered, .name = x$name)
+    } else if (bare_number) {
+      readr::col_double() # Not col_integer() to avoid big integers issues
+    } else {
+      readr::col_number() # Strips non-numeric chars
+    },
+    "boolean" = readr::col_logical(),
+    "object" = readr::col_character(),
+    "array" = readr::col_character(),
+    "date" = readr::col_date(format = switch(format,
+      "default" = "%Y-%m-%d", # ISO
+      "any" = "%AD", # YMD
+      "%x" = "%m/%d/%y", # Python strptime for %x
+      format # Default
+    )),
+    "time" = readr::col_time(format = switch(format,
+      "default" = "%AT", # H(MS)
+      "any" = "%AT", # H(MS)
+      "%X" = "%H:%M:%S", # HMS
+      sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
+    )),
+    "datetime" = readr::col_datetime(format = switch(format,
+      "default" = "", # ISO (lenient)
+      "any" = "", # ISO (lenient)
+      sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
+    )),
+    "year" = readr::col_date(format = "%Y"),
+    "yearmonth" = readr::col_date(format = "%Y-%m"),
+    "duration" = readr::col_character(),
+    "geopoint" = readr::col_character(),
+    "geojson" = readr::col_character(),
+    "any" = readr::col_character()
+  )
+  # col_type will be NULL when type is undefined (NA_character_) or an
+  # unrecognized value (e.g. "datum", but will be blocked by check_schema()).
+  # Set those to col_guess().
+  col_type <- col_type %||% readr::col_guess()
+  col_type
+}
+
+categories_to_v_col <- function(categories, categoriesOrdered, .name) {
+  if (is.character(categories) || is.numeric(categories)) {
+    readr::col_factor(
+      levels = as.character(categories),
+      ordered = categoriesOrdered
+    )
+  } else if (is.list(categories)) {
+    have_value_prop <- purrr::map_lgl(categories, \(v) !is.null(v$value))
+
+    if (!all(have_value_prop)) {
+      cli::cli_abort(
+        "categories for {.name} is missing a `value` property"
+      )
+    }
+
+    values <- purrr::map_vec(categories, \(v) v$value)
+
+    labels <- purrr::map_chr(
+      seq_along(categories),
+      \(i) as.character(categories[[i]]$label %||% categories[[i]]$value)
+    )
+
+    if (all(values == labels)) {
+      readr::col_factor(
+        levels = as.character(values),
+        ordered = categoriesOrdered
+      )
+    } else {
+      interlacer::v_col_cfactor(
+        codes = set_names(values, labels),
+        ordered = categoriesOrdered
+      )
+    }
+  }
+}
+
+# Map a field's (or the schema's) missingValues to an interlacer na_col spec
+missing_values_to_na_col <- function(missing_values, .name) {
+  if (is.null(missing_values)) {
+    interlacer::na_col_default()
+  } else if (is_numeric_coercible(missing_values)) {
+    inject(interlacer::na_col_integer(!!!as.integer(missing_values)))
+  } else if (is.character(missing_values)) {
+    inject(interlacer::na_col_factor(!!!missing_values))
+  } else if (is.list(missing_values)) {
+    if (length(missing_values) == 0) {
+      interlacer::na_col_none()
+    } else {
+      have_value_prop <- purrr::map_lgl(missing_values, \(v) !is.null(v$value))
+
+      if (!all(have_value_prop)) {
+        cli::cli_abort(
+          "missingValues for {.name} is missing a `value` property"
+        )
+      }
+
+      values <- purrr::map_chr(missing_values, \(v) v$value)
+
+      if (is_numeric_coercible(values)) {
+        values <- as.numeric(values)
+      }
+
+      labels <- purrr::map_chr(
+        seq_along(missing_values),
+        \(i) missing_values[[i]]$label %||% missing_values[[i]]$value
+      )
+
+      if (all(values == labels)) {
+        inject(interlacer::na_col_factor(!!!values))
+      } else {
+        inject(interlacer::na_col_cfactor(!!!set_names(values, labels)))
+      }
+    }
+  } else {
+    cli::cli_abort(
+      "Cannot process missingValues for {.name}; expected list of strings or objects"
+    )
+  }
+}
+
+is_numeric_coercible <- function(x) {
+  if (is.numeric(x)) {
+    TRUE
+  } else if (is.character(x)) {
+    n_na <- sum(is.na(x))
+    conv <- suppressWarnings(as.numeric(x))
+    n_na == sum(is.na(conv))
+  } else {
+    FALSE
+  }
+}
diff --git a/man/read_resource.Rd b/man/read_resource.Rd
index ec9c34a6..fc0da522 100644
--- a/man/read_resource.Rd
+++ b/man/read_resource.Rd
@@ -2,9 +2,12 @@
 % Please edit documentation in R/read_resource.R
 \name{read_resource}
 \alias{read_resource}
+\alias{read_interlaced_resource}
 \title{Read data from a Data Resource into a tibble data frame}
 \usage{
-read_resource(package, resource_name, col_select = NULL)
+read_resource(package, resource_name, col_select = NULL, interlaced = FALSE)
+
+read_interlaced_resource(...)
 }
 \arguments{
 \item{package}{Data Package object, created with \code{\link[=read_package]{read_package()}} or
@@ -15,6 +18,11 @@ read_resource(package, resource_name, col_select = NULL)
 \item{col_select}{Character vector of the columns to include in the result,
 in the order provided.
 Selecting columns can improve read speed.}
+
+\item{interlaced}{Logical value indicating whether columns should be read
+as interlaced columns using the interlacer package.}
+
+\item{...}{Arguments passed to \code{read_resource()}.}
 }
 \value{
 \code{\link[=tibble]{tibble()}} data frame with the Data Resource's tabular data.
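Reviewer note, not part of the patch: a minimal usage sketch of the two read paths. `datapackage.json`, the resource name `observations`, and the column `counts` are placeholders; the only interlacer call shown, `na_channel()`, is the accessor already exercised in the tests further down.

```r
library(frictionless)

package <- read_package("datapackage.json")            # placeholder file
plain <- read_resource(package, "observations")        # missing reasons collapse to NA, as before
coded <- read_interlaced_resource(package, "observations")

# The interlaced read keeps missing value reasons in a separate channel
interlacer::na_channel(coded$counts)                   # placeholder column
```
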
diff --git a/tests/testthat/data/type_fct.csv b/tests/testthat/data/type_fct.csv
new file mode 100644
index 00000000..c3a6a95c
--- /dev/null
+++ b/tests/testthat/data/type_fct.csv
@@ -0,0 +1,5 @@
+fct_chr,fct_int,ord_chr,ord_int
+red,10,red,10
+red,20,red,20
+green,10,green,10
+blue,10,blue,10
diff --git a/tests/testthat/data/type_interlaced.csv b/tests/testthat/data/type_interlaced.csv
new file mode 100644
index 00000000..7d64806f
--- /dev/null
+++ b/tests/testthat/data/type_interlaced.csv
@@ -0,0 +1,6 @@
+na_fct,na_int,na_cfct,na_none,na_default,cfct_chr,cfct_int
+1.1,1.1,1.1,1,1,a,10
+2.2,2.2,2.2,2,2,b,20
+OMITTED,-99,-99,3,OMITTED,a,10
+REFUSED,-98,-98,3,3,b,20
+5.5,5.5,5.5,4,4,a,10
diff --git a/tests/testthat/data/types.json b/tests/testthat/data/types.json
index 5d520e80..93221d71 100644
--- a/tests/testthat/data/types.json
+++ b/tests/testthat/data/types.json
@@ -13,12 +13,7 @@
         {
           "name": "str_factor",
           "type": "string",
-          "constraints": {
-            "enum": [
-              "foo",
-              "bar"
-            ]
-          }
+          "categories": ["foo", "bar"]
         }
       ]
     }
@@ -40,13 +35,7 @@
         {
           "name": "num_factor",
           "type": "number",
-          "constraints": {
-            "enum": [
-              3.1,
-              3.2,
-              3.3
-            ]
-          }
+          "notes": "This is deprecated; factors should only be int and string"
         },
         {
           "name": "num_nan",
@@ -94,13 +83,7 @@
         {
           "name": "int_factor",
           "type": "integer",
-          "constraints": {
-            "enum": [
-              3,
-              4,
-              -1
-            ]
-          }
+          "categories": [3, 4, -1]
         },
         {
           "name": "int_ws",
@@ -294,6 +277,61 @@
           }
         ]
       }
+    },
+    {
+      "name": "interlaced",
+      "path": "type_interlaced.csv",
+      "profile": "tabular-data-resource",
+      "schema": {
+        "missingValues": ["OMITTED"],
+        "fields": [
+          {
+            "name": "na_fct",
+            "type": "number",
+            "missingValues": ["OMITTED", "REFUSED"]
+          },
+          {
+            "name": "na_int",
+            "type": "number",
+            "missingValues": ["-99", "-98"]
+          },
+          {
+            "name": "na_cfct",
+            "type": "number",
+            "missingValues": [
+              { "value": "-99", "label": "OMITTED" },
+              { "value": "-98", "label": "REFUSED" }
+            ]
+          },
+          {
+            "name": "na_none",
+            "type": "integer",
+            "missingValues": []
+          },
+          {
+            "name": "na_default",
+            "type": "number"
+          },
+          {
+            "name": "cfct_chr",
+            "type": "string",
+            "categories": [
+              { "value": "a", "label": "APPLE" },
+              { "value": "b", "label": "BANANA" }
+            ],
+            "missingValues": []
+          },
+          {
+            "name": "cfct_int",
+            "type": "integer",
+            "categories": [
+              { "value": 10, "label": "APPLE" },
+              { "value": 20, "label": "BANANA" }
+            ],
+            "missingValues": []
+          }
+        ]
+      }
     }
   ]
 }
diff --git a/tests/testthat/test-read_resource.R b/tests/testthat/test-read_resource.R
index 8d6faec0..c69678eb 100644
--- a/tests/testthat/test-read_resource.R
+++ b/tests/testthat/test-read_resource.R
@@ -582,10 +582,10 @@ test_that("read_resource() handles strings", {
   resource <- read_resource(p, "string")
   expect_type(resource$str, "character")
 
-  # Use factor when enum is present
-  enum <- p$resources[[1]]$schema$fields[[2]]$constraints$enum
+  # Use factor when categories are present
+  categories <- p$resources[[1]]$schema$fields[[2]]$categories
   expect_s3_class(resource$str_factor, "factor")
-  expect_identical(levels(resource$str_factor), enum)
+  expect_identical(levels(resource$str_factor), categories)
 })
 
 test_that("read_resource() handles numbers", {
@@ -598,10 +598,8 @@ test_that("read_resource() handles numbers", {
   expect_type(resource$num_neg, "double")
   expect_true(all(resource$num_neg == -3))
 
-  # Use factor when enum is present
-  enum <- p$resources[[2]]$schema$fields[[3]]$constraints$enum
-  expect_s3_class(resource$num_factor, "factor")
-  expect_identical(levels(resource$num_factor), as.character(enum))
+  # Use factor when enum is present (deprecated)
+  # TODO: remove num_factor from test file
 
   # NaN, INF, -INF are supported, case-insensitive
   expect_type(resource$num_nan, "double")
@@ -631,10 +629,10 @@ test_that("read_resource() handles integers (as doubles)", {
   expect_type(resource$int_neg, "double")
   expect_true(all(resource$int_neg == -3))
 
-  # Use factor when enum is present
-  enum <- p$resources[[3]]$schema$fields[[3]]$constraints$enum
+  # Use factor when categories are present
+  categories <- p$resources[[3]]$schema$fields[[3]]$categories
   expect_s3_class(resource$int_factor, "factor")
-  expect_identical(levels(resource$int_factor), as.character(enum))
+  expect_identical(levels(resource$int_factor), as.character(categories))
 
   # bareNumber = false allows whitespace and non-numeric characters
   expect_type(resource$int_ws, "double")
@@ -738,3 +736,37 @@ test_that("read_resource() handles other types", {
   # Guess undefined types, unknown types are blocked by check_schema()
   expect_type(resource$no_type, "logical")
 })
+
+test_that("read_resource() handles interlaced types", {
+  p <- read_package(test_path("data/types.json"))
+  resource <- read_interlaced_resource(p, "interlaced")
+
+  # Interpret fct missing reasons
+  expect_s3_class(interlacer::na_channel(resource$na_fct), "factor")
+  expect_true(resource$na_fct[[3]] == interlacer::na("OMITTED"))
+
+  # Interpret int missing reasons
+  expect_type(interlacer::na_channel(resource$na_int), "integer")
+  expect_true(resource$na_int[[3]] == interlacer::na(-99))
+
+  # Interpret cfct missing reasons
+  expect_s3_class(interlacer::na_channel(resource$na_cfct), "interlacer_cfactor")
+  expect_true(resource$na_cfct[[3]] == interlacer::na("OMITTED"))
+
+  # Interpret none missing reasons
+  expect_s3_class(resource$na_none, NA)
+
+  # Interpret default missing reasons
+  expect_s3_class(interlacer::na_channel(resource$na_default), "factor")
+  expect_true(resource$na_default[[3]] == interlacer::na("OMITTED"))
+
+  # Interpret cfct_chr
+  expect_s3_class(resource$cfct_chr, "interlacer_cfactor")
+  expect_true(resource$cfct_chr[[3]] == "APPLE")
+  expect_true(interlacer::as.codes(resource$cfct_chr[[3]]) == "a")
+
+  # Interpret cfct_int
+  expect_s3_class(resource$cfct_int, "interlacer_cfactor")
+  expect_true(resource$cfct_int[[3]] == "APPLE")
+  expect_true(interlacer::as.codes(resource$cfct_int[[3]]) == 10)
+})
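
Reviewer note, not part of the patch: a sketch of the column spec the new helpers are expected to build for the `na_cfct` fixture field above. The `x_col()` / `na_col_cfactor()` calls simply mirror the ones used in the patch; this illustrates the intended mapping rather than documenting interlacer's API.

```r
# The na_cfct field from types.json, written out as the R list the reader sees
field <- list(
  name = "na_cfct",
  type = "number",
  missingValues = list(
    list(value = "-99", label = "OMITTED"),
    list(value = "-98", label = "REFUSED")
  )
)

# field_to_x_col(field) should then be roughly equivalent to:
interlacer::x_col(
  readr::col_double(),                                      # "number" with default bareNumber
  interlacer::na_col_cfactor(OMITTED = -99, REFUSED = -98)  # labelled missing reasons
)
```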