add interlacer support via read_interlaced_resource() #213

Open · wants to merge 3 commits into base: main
1 change: 1 addition & 0 deletions DESCRIPTION
@@ -35,6 +35,7 @@ Imports:
cli,
dplyr,
httr,
interlacer (>= 0.3.2),
jsonlite,
purrr,
readr (>= 2.1.0),
1 change: 1 addition & 0 deletions NAMESPACE
@@ -7,6 +7,7 @@ export(create_package)
export(create_schema)
export(get_schema)
export(problems)
export(read_interlaced_resource)
export(read_package)
export(read_resource)
export(remove_resource)
1 change: 1 addition & 0 deletions R/frictionless-package.R
@@ -2,4 +2,5 @@
"_PACKAGE"

#' @import rlang

NULL
259 changes: 190 additions & 69 deletions R/read_resource.R
@@ -16,6 +16,9 @@
#' @param col_select Character vector of the columns to include in the result,
#' in the order provided.
#' Selecting columns can improve read speed.
#' @param interlaced Logical. If `TRUE`, columns are read as interlaced
#' columns (values interlaced with missing value reasons) using the
#' interlacer package.
#' @param ... Arguments passed to `read_resource()`.
#' @return [tibble()] data frame with the Data Resource's tabular data.
#' If there are parsing problems, a warning will alert you.
#' You can retrieve the full details by calling [problems()] on your data
@@ -198,7 +201,8 @@
#'
#' # Read data from the resource "deployments" with column selection
#' read_resource(package, "deployments", col_select = c("latitude", "longitude"))
read_resource <- function(package, resource_name, col_select = NULL) {
read_resource <- function(package, resource_name, col_select = NULL,
interlaced = FALSE) {
# Get resource, includes check_package()
resource <- get_resource(package, resource_name)

@@ -261,77 +265,27 @@ read_resource <- function(package, resource_name, col_select = NULL) {
)

# Create col_types: list(<collector_character>, <collector_logical>, ...)
col_types <- purrr::map(fields, function(x) {
type <- x$type %||% NA_character_
enum <- x$constraints$enum
group_char <- if (x$groupChar %||% "" != "") TRUE else FALSE
bare_number <- if (x$bareNumber %||% "" != FALSE) TRUE else FALSE
format <- x$format %||% "default" # Undefined => default

# Assign types and formats
col_type <- switch(type,
"string" = if (length(enum) > 0) {
readr::col_factor(levels = enum)
} else {
readr::col_character()
},
"number" = if (length(enum) > 0) {
readr::col_factor(levels = as.character(enum))
} else if (group_char) {
readr::col_number() # Supports grouping_mark
} else if (bare_number) {
readr::col_double() # Allows NaN, INF, -INF
} else {
readr::col_number() # Strips non-num. chars, uses default grouping_mark
},
"integer" = if (length(enum) > 0) {
readr::col_factor(levels = as.character(enum))
} else if (bare_number) {
readr::col_double() # Not col_integer() to avoid big integers issues
} else {
readr::col_number() # Strips non-numeric chars
},
"boolean" = readr::col_logical(),
"object" = readr::col_character(),
"array" = readr::col_character(),
"date" = readr::col_date(format = switch(format,
"default" = "%Y-%m-%d", # ISO
"any" = "%AD", # YMD
"%x" = "%m/%d/%y", # Python strptime for %x
format # Default
)),
"time" = readr::col_time(format = switch(format,
"default" = "%AT", # H(MS)
"any" = "%AT", # H(MS)
"%X" = "%H:%M:%S", # HMS
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"datetime" = readr::col_datetime(format = switch(format,
"default" = "", # ISO (lenient)
"any" = "", # ISO (lenient)
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"year" = readr::col_date(format = "%Y"),
"yearmonth" = readr::col_date(format = "%Y-%m"),
"duration" = readr::col_character(),
"geopoint" = readr::col_character(),
"geojson" = readr::col_character(),
"any" = readr::col_character()
)
# col_type will be NULL when type is undefined (NA_character_) or an
# unrecognized value (e.g. "datum", but will be blocked by check_schema()).
# Set those to col_guess().
col_type <- col_type %||% readr::col_guess()
col_type
})

# Assign names: list("name1" = <collector_character>, "name2" = ...)
names(col_types) <- field_names
col_types <- interlacer::as.x_col_spec(
purrr::map(set_names(fields, field_names), field_to_x_col)
)

# Select CSV dialect, see https://specs.frictionlessdata.io/csv-dialect/
# Note that dialect can be NULL
dialect <- read_descriptor(resource$dialect, package$directory, safe = TRUE)

na_col <- missing_values_to_na_col(
schema$missingValues %||% "",
.name = "default"
)

if (interlaced) {
read_fn <- interlacer::read_interlaced_delim
na <- na_col
} else {
read_fn <- readr::read_delim
na <- na_col$chr_values
}

# Read data directly
if (resource$read_from == "df") {
df <- dplyr::as_tibble(resource$data)
@@ -344,7 +298,7 @@ read_resource <- function(package, resource_name, col_select = NULL) {
} else if (resource$read_from == "path" || resource$read_from == "url") {
dataframes <- list()
for (i in seq_along(paths)) {
data <- readr::read_delim(
data <- read_fn(
file = paths[i],
delim = dialect$delimiter %||% ",",
quote = dialect$quoteChar %||% "\"",
@@ -365,7 +319,7 @@ read_resource <- function(package, resource_name, col_select = NULL) {
# a column, see https://rlang.r-lib.org/reference/topic-data-mask.html
col_select = {{col_select}},
locale = locale,
na = schema$missingValues %||% "",
na = na,
comment = dialect$commentChar %||% "",
trim_ws = dialect$skipInitialSpace %||% FALSE,
# Skip header row when present
@@ -380,3 +334,170 @@

return(df)
}

#' @rdname read_resource
#' @export
read_interlaced_resource <- function(...) {
read_resource(..., interlaced = TRUE)
}

field_to_x_col <- function(x) {
interlacer::x_col(
field_to_v_col(x),
missing_values_to_na_col(x$missingValues, .name = x$name)
)
}

field_to_v_col <- function(x) {
type <- x$type %||% NA_character_
categories <- x$categories
categoriesOrdered <- x$categoriesOrdered %||% FALSE
group_char <- if (x$groupChar %||% "" != "") TRUE else FALSE
bare_number <- if (x$bareNumber %||% "" != FALSE) TRUE else FALSE
format <- x$format %||% "default" # Undefined => default

# Assign types and formats
col_type <- switch(type,
"string" = if (!is.null(categories)) {
categories_to_v_col(categories, categoriesOrdered, .name = x$name)
} else {
readr::col_character()
},
"number" = if (group_char) {
readr::col_number() # Supports grouping_mark
} else if (bare_number) {
readr::col_double() # Allows NaN, INF, -INF
} else {
readr::col_number() # Strips non-num. chars, uses default grouping_mark
},
"integer" = if (!is.null(categories)) {
categories_to_v_col(categories, categoriesOrdered, .name = x$name)
} else if (bare_number) {
readr::col_double() # Not col_integer() to avoid big integers issues
} else {
readr::col_number() # Strips non-numeric chars
},
"boolean" = readr::col_logical(),
"object" = readr::col_character(),
"array" = readr::col_character(),
"date" = readr::col_date(format = switch(format,
"default" = "%Y-%m-%d", # ISO
"any" = "%AD", # YMD
"%x" = "%m/%d/%y", # Python strptime for %x
format # Default
)),
"time" = readr::col_time(format = switch(format,
"default" = "%AT", # H(MS)
"any" = "%AT", # H(MS)
"%X" = "%H:%M:%S", # HMS
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"datetime" = readr::col_datetime(format = switch(format,
"default" = "", # ISO (lenient)
"any" = "", # ISO (lenient)
sub("%S.%f", "%OS", format) # Default, use %OS for milli/microseconds
)),
"year" = readr::col_date(format = "%Y"),
"yearmonth" = readr::col_date(format = "%Y-%m"),
"duration" = readr::col_character(),
"geopoint" = readr::col_character(),
"geojson" = readr::col_character(),
"any" = readr::col_character()
)
# col_type will be NULL when type is undefined (NA_character_) or an
# unrecognized value (e.g. "datum", but will be blocked by check_schema()).
# Set those to col_guess().
col_type <- col_type %||% readr::col_guess()
col_type
}

categories_to_v_col <- function(categories, categoriesOrdered, .name) {
if (is.character(categories) || is.numeric(categories)) {
readr::col_factor(
levels = as.character(categories),
ordered = categoriesOrdered
)
} else if (is.list(categories)) {
have_value_prop <- purrr::map_lgl(categories, \(v) !is.null(v$value))

if (!all(have_value_prop)) {
cli::cli_abort(
"categories for {.name} is missing a `value` property"
)
}

values <- purrr::map_vec(categories, \(v) v$value)

labels <- purrr::map_chr(
seq_along(categories),
\(i) categories[[i]]$label %||% categories[[i]]$value
)

if (all(values == labels)) {
readr::col_factor(
levels = as.character(values),
ordered = categoriesOrdered
)
} else {
interlacer::v_col_cfactor(
codes = set_names(values, labels),
ordered = categoriesOrdered
)
}
}
}

missing_values_to_na_col <- function(missing_values, .name) {
if (is.null(missing_values)) {
interlacer::na_col_default()
} else if (is_numeric_coercible(missing_values)) {
inject(interlacer::na_col_integer(!!!as.integer(missing_values)))
} else if (is.character(missing_values)) {
inject(interlacer::na_col_factor(!!!missing_values))
} else if (is.list(missing_values)) {
if (length(missing_values) == 0) {
interlacer::na_col_none()
} else {
have_value_prop <- purrr::map_lgl(missing_values, \(v) !is.null(v$value))

if (!all(have_value_prop)) {
cli::cli_abort(
"missingValues for {.name} is missing a `value` property"
)
}

values <- purrr::map_chr(missing_values, \(v) v$value)

if (is_numeric_coercible(values)) {
values <- as.numeric(values)
}

labels <- purrr::map_chr(
seq_along(missing_values),
\(i) missing_values[[i]]$label %||% missing_values[[i]]$value
)

if (all(values == labels)) {
readr::col_factor(levels = as.character(values)) # levels must be character
} else {
inject(interlacer::na_col_cfactor(!!!set_names(values, labels)))
}
}
} else {
cli::cli_abort(
"Cannot process missingValues for {.name}; expected list of strings or objects"
)
}
}

is_numeric_coercible <- function(x) {
if (is.numeric(x)) {
TRUE
} else if (is.character(x)) {
n_na <- sum(is.na(x))
conv <- suppressWarnings(as.numeric(x))
n_na == sum(is.na(conv))
} else {
FALSE
}
}
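
For reviewers, a short usage sketch of the new entry point (not part of the diff). It assumes the example Data Package bundled with frictionless and its "deployments" resource, as used in the existing documentation examples; the object names are illustrative only.

library(frictionless)

# Example Data Package shipped with frictionless
package <- example_package()

# Current behaviour: missing values are collapsed into NA
deployments <- read_resource(package, "deployments")

# New behaviour: values and missing value reasons are kept side by side
# via interlacer; equivalent to read_resource(package, "deployments", interlaced = TRUE)
deployments_x <- read_interlaced_resource(package, "deployments")

Both calls go through the same read_resource() machinery; only the reader function and the na specification differ (see the if (interlaced) branch above).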
10 changes: 9 additions & 1 deletion man/read_resource.Rd

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions tests/testthat/data/type_fct.csv
@@ -0,0 +1,5 @@
fct_chr,fct_int,ord_chr,ord_int
red,10,red,10
red,20,red,20
green,10,green,10
blue,10,blue,10
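
As a reading aid (not part of the diff), a sketch of the collectors categories_to_v_col() would build for fixtures like the one above. The field descriptors are hypothetical R lists; only the mapping to readr::col_factor() is taken from the diff.

library(readr)

# Hypothetical field descriptors mirroring the fct_chr and ord_chr columns above
field_fct <- list(name = "fct_chr", type = "string",
                  categories = c("red", "green", "blue"))
field_ord <- list(name = "ord_chr", type = "string",
                  categories = c("red", "green", "blue"),
                  categoriesOrdered = TRUE)

# For a character categories vector, categories_to_v_col() returns a plain
# factor collector; categoriesOrdered only toggles the ordered flag.
col_fct <- readr::col_factor(levels = c("red", "green", "blue"), ordered = FALSE)
col_ord <- readr::col_factor(levels = c("red", "green", "blue"), ordered = TRUE)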
6 changes: 6 additions & 0 deletions tests/testthat/data/type_interlaced.csv
@@ -0,0 +1,6 @@
na_fct,na_int,na_cfct,na_none,na_default,cfct_chr,cfct_int
1.1,1.1,1.1,1,1,a,10
2.2,2.2,2.2,2,2,b,20
OMITTED,-99,-99,3,OMITTED,a,10
REFUSED,-98,-98,3,3,b,20
5.5,5.5,5.5,4,4,a,10
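
Similarly (not part of the diff), a sketch of the interlaced column spec field_to_x_col() would build for a number field declaring missingValues = ["OMITTED", "REFUSED"], which is how a column like na_fct above keeps values and missing reasons apart. The field descriptor is hypothetical; the interlacer calls are the ones introduced in the diff.

library(interlacer)
library(rlang)

# Hypothetical field descriptor for the na_fct column above
field <- list(
  name = "na_fct",
  type = "number",
  missingValues = c("OMITTED", "REFUSED")
)

# x_col() pairs a value collector with a missing-reason collector:
# type "number" (default bareNumber) maps to col_double(), and character
# missingValues map to na_col_factor("OMITTED", "REFUSED").
col_spec <- interlacer::x_col(
  readr::col_double(),
  inject(interlacer::na_col_factor(!!!field$missingValues))
)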