From af93ef8b0216bc292e57f8ea394d85780cc11c9c Mon Sep 17 00:00:00 2001 From: Collin Schwantes Date: Fri, 23 Aug 2024 15:00:37 -0600 Subject: [PATCH 1/8] Increment version number to 0.3.4 --- DESCRIPTION | 2 +- NEWS.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 708637e..a65ea22 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ohcleandat Type: Package Title: One Health Data Cleaning and Quality Checking Package -Version: 0.3.3 +Version: 0.3.4 Authors@R: c( person("Collin", "Schwantes", email = "schwantes@ecohealthalliance.org", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-4014-4896")), person("Johana", "Teigen", email = "teigen@ecohealthalliance.org", role = "aut", comment = c(ORCID = "0000-0002-6209-2321")), diff --git a/NEWS.md b/NEWS.md index 9b65d50..3e4427a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# ohcleandat 0.3.4 + # ohcleandat 0.3.3 * Adds more control over the function used in `get_precision` and `obfuscate_gps` From 5f75ab640297d6e94c01e3e2ca6c0919bf5cb020 Mon Sep 17 00:00:00 2001 From: Collin Schwantes Date: Fri, 23 Aug 2024 16:33:40 -0600 Subject: [PATCH 2/8] adding function for creating structural metadata --- NAMESPACE | 1 + NEWS.md | 2 + R/create_structural_metadata.R | 121 ++++++++++++++++++++++++++++++ man/create_structural_metadata.Rd | 104 +++++++++++++++++++++++++ 4 files changed, 228 insertions(+) create mode 100644 R/create_structural_metadata.R create mode 100644 man/create_structural_metadata.Rd diff --git a/NAMESPACE b/NAMESPACE index 512612f..a0e58b5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(correct_data) export(create_freetext_log) export(create_questionnaire_log) export(create_rules_from_template) +export(create_structural_metadata) export(create_translation_log) export(create_validation_log) export(detect_language) diff --git a/NEWS.md b/NEWS.md index 3e4427a..f18f360 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # 
ohcleandat 0.3.4 +* Sets up a minimal structural metadata framework for tabular datasets. + # ohcleandat 0.3.3 * Adds more control over the function used in `get_precision` and `obfuscate_gps` diff --git a/R/create_structural_metadata.R b/R/create_structural_metadata.R new file mode 100644 index 0000000..4c4bcda --- /dev/null +++ b/R/create_structural_metadata.R @@ -0,0 +1,121 @@ +#' Create Structural Metadata from a dataframe +#' +#' This is the metadata that describes the data themselves. This metadata can be +#' generated then joined to pre-existing metadata via field names. +#' +#' +#' @param data Any named object. Expects a table but will work +#' superficially with lists or named vectors. +#' +#' @details +#' +#' The metadata table produced has the following elements +#' +#' `name` = The name of the field. This is taken as is from `data`. +#' `description` = Description of that field. May be provided by controlled vocabulary +#' `units` = Units of measure for that field. May or may not apply +#' `term_uri` = Uniform Resource Identifier for a term from a controlled vocabulary or schema +#' `comments` = Free text providing additional details about the field +#' `primary_key` = `TRUE` or `FALSE`, Uniquely identifies each record in the data +#' `foreign_key` = `TRUE` or `FALSE`, Allows for linkages between data sets. Uniquely identifies +#' records in a different data set +#' +#' +#' @return dataframe with standard metadata requirements +#' @export +#' +#' @examples +#' \dontrun{ +#' df <- data.frame(a = 1:10, b = letters[1:10]) +#' df_metadata <- create_structural_metadata(df) +#' write.csv(df_metadata,"df_metadata.csv") +#' +#' # lets pretend we are using a dataset which already has +#' ## in airtable, you can add field descriptions directly +#' ## in the base. 
We want those exported and properly formatted +#' ## in our ohcleandat workflow +#' +#' base <- "appMyBaseID" +#' table_name <- "My Table" +#' +#' airtable_metadata <- airtabler::air_generate_metadata_from_api(base = base, +#' field_names_to_snake_case = FALSE ) |> +#' dplyr::filter(table_name == {table_name}) |> +#' dplyr::select(field_name,field_desc,primary_key) +#' +#' airtable_df <- airtabler::fetch_all(base = base, table_name = table_name) +#' +#' airtable_df_metadata <- create_structural_metadata(airtable_df) +#' +#' metadata_joined <- dplyr::left_join(airtable_df_metadata,airtable_metadata, +#' by = c("name"="field_name")) +#' +#' metdata_updated <- metadata_joined |> +#' dplyr::mutate(description = field_desc, +#' primary_key = primary_key.y, +#' ) |> +#' dplyr::select(-matches('\\.[xy]|field_desc')) +#' +#' # ODK +#' # get all choices from ODK form +#' +#' dotenv::load_dot_env() +#' +#' ruODK::ru_setup( +#' svc = "https://odk.server.org/v1/projects/5/forms/myproject.svc", +#' un = Sys.getenv("ODK_USERNAME"), +#' pw = Sys.getenv("ODK_PASSWORD"), +#' tz = "GMT", +#' odkc_version = "1.1.2") +#' +#' +#' schema <- ruODK::form_schema_ext() +#' +#' schema$choices_flat <-schema$`choices_english_(en)` |> +#' purrr::map_chr(\(x){ +#' if("labels" %in% names(x)){ +#' paste(x$labels,collapse = ", ") +#' } else { +#' "" +#' } +#' +#' }) +#' +#' data_odk <- ruODK::odata_submission_get() +#' data_odk_rect <- ruODK::odata_submission_rectangle(data_odk) +#' odk_metadata <- create_structural_metadata(data_odk_rect) +#' +#' +#' odk_metadata_joined <- dplyr::left_join(odk_metadata,schema_simple, +#' by = c("name" = "ruodk_name")) +#' +#' odk_metadata_choices <- odk_metadata_joined |> +#' mutate(description = choices_flat) |> +#' select(-choices_flat) +#' +#' +#' } +#' +create_structural_metadata <- function(data){ + + + # create empty data frame + metadata <- tibble::tibble( + name = character(), + description = character(), + units = character(), + term_uri = character(), + 
comments = character(), + primary_key = logical(), + foreign_key = logical() + ) + + + # get fields + metadata$name = names(data) + + + return(metadata) + + +} diff --git a/man/create_structural_metadata.Rd b/man/create_structural_metadata.Rd new file mode 100644 index 0000000..6c54ca8 --- /dev/null +++ b/man/create_structural_metadata.Rd @@ -0,0 +1,104 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_structural_metadata.R +\name{create_structural_metadata} +\alias{create_structural_metadata} +\title{Create Structural Metadata from a dataframe} +\usage{ +create_structural_metadata(data) +} +\arguments{ +\item{data}{Any named object. Expects a table but will work +superficially with lists or named vectors.} +} +\value{ +dataframe with standard metadata requirements +} +\description{ +This is the metadata that describes the data themselves. This metadata can be +generated then joined to pre-existing metadata via field names. +} +\details{ +The metadata table produced has the following elements + +\code{name} = The name of the field. This is taken as is from \code{data}. +\code{description} = Description of that field. May be provided by controlled vocabulary +\code{units} = Units of measure for that field. May or may not apply +\code{term_uri} = Universal Resource Identifier for a term from a controlled vocabulary or schema +\code{comments} = Free text providing additional details about the field +\code{primary_key} = \code{TRUE} or \code{FALSE}, Uniquely identifies each record in the data +\code{foreign_key} = \code{TRUE} or \code{FALSE}, Allows for linkages between data sets. Uniquely identifies +records in a different data set +} +\examples{ +\dontrun{ +df <- data.frame(a = 1:10, b = letters[1:10]) +df_metadata <- create_structural_metadata(df) +write.csv(df_metadata,"df_metadata.csv") + +# lets pretend we are using a dataset which already has +## in airtable, you can add field descriptions directly +## in the base. 
We want those exported and properly formatted +## in our ohcleandat workflow + + base <- "appMyBaseID" + table_name <- "My Table" + + airtable_metadata <- airtabler::air_generate_metadata_from_api(base = base, + field_names_to_snake_case = FALSE ) |> + dplyr::filter(table_name == {table_name}) |> + dplyr::select(field_name,field_desc,primary_key) + + airtable_df <- airtabler::fetch_all(base = base, table_name = table_name) + + airtable_df_metadata <- create_structural_metadata(airtable_df) + + metadata_joined <- dplyr::left_join(airtable_df_metadata,airtable_metadata, + by = c("name"="field_name")) + + metdata_updated <- metadata_joined |> + dplyr::mutate(description = field_desc, + primary_key = primary_key.y, + ) |> + dplyr::select(-matches('\\\\.[xy]|field_desc')) + +# ODK +# get all choices from ODK form + +dotenv::load_dot_env() + +ruODK::ru_setup( + svc = "https://odk.server.org/v1/projects/5/forms/myproject.svc", + un = Sys.getenv("ODK_USERNAME"), + pw = Sys.getenv("ODK_PASSWORD"), + tz = "GMT", + odkc_version = "1.1.2") + + +schema <- ruODK::form_schema_ext() + +schema$choices_flat <-schema$`choices_english_(en)` |> + purrr::map_chr(\(x){ + if("labels" \%in\% names(x)){ + paste(x$labels,collapse = ", ") + } else { + "" + } + + }) + + data_odk <- ruODK::odata_submission_get() + data_odk_rect <- ruODK::odata_submission_rectangle(data_odk) + odk_metadata <- create_structural_metadata(data_odk_rect) + + + odk_metadata_joined <- dplyr::left_join(odk_metadata,schema_simple, + by = c("name" = "ruodk_name")) + + odk_metadata_choices <- odk_metadata_joined |> + mutate(description = choices_flat) |> + select(-choices_flat) + + +} + +} From 4c22ee7f3757ffce366ed3609e267a2923b18b8b Mon Sep 17 00:00:00 2001 From: Collin Schwantes Date: Tue, 27 Aug 2024 09:52:32 -0600 Subject: [PATCH 3/8] ignoring .env file --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4b29081..49f9416 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 
@@ inst/doc docs auth +.env From d856235a043d7644f843054df6903e0c4573b4ed Mon Sep 17 00:00:00 2001 From: Collin Schwantes Date: Tue, 27 Aug 2024 11:43:49 -0600 Subject: [PATCH 4/8] adding metadata vignette and updating functions --- .Rbuildignore | 1 + .gitignore | 2 +- NAMESPACE | 1 + R/create_structural_metadata.R | 158 +++++++++++- man/create_structural_metadata.Rd | 29 ++- man/update_structural_metadata.Rd | 34 +++ vignettes/data_examples/datapackage.json | 79 ++++++ vignettes/data_examples/my_data.csv | 11 + vignettes/metadata.Rmd | 225 ++++++++++++++++++ .../metadata_examples/structural_metadata.csv | 6 + .../structural_metadata_complete.csv | 6 + vignettes/metadata_template.json | 83 +++++++ 12 files changed, 617 insertions(+), 18 deletions(-) create mode 100644 man/update_structural_metadata.Rd create mode 100644 vignettes/data_examples/datapackage.json create mode 100644 vignettes/data_examples/my_data.csv create mode 100644 vignettes/metadata.Rmd create mode 100644 vignettes/metadata_examples/structural_metadata.csv create mode 100644 vignettes/metadata_examples/structural_metadata_complete.csv create mode 100644 vignettes/metadata_template.json diff --git a/.Rbuildignore b/.Rbuildignore index 4cb98ff..565c6c1 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,3 +6,4 @@ ^_pkgdown\.yml$ ^docs$ ^pkgdown$ +^\.env$ diff --git a/.gitignore b/.gitignore index 49f9416..2935a49 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,4 @@ inst/doc docs auth -.env +**/.env diff --git a/NAMESPACE b/NAMESPACE index a0e58b5..facfb63 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,6 +30,7 @@ export(read_excel_all_sheets) export(read_googlesheets) export(remove_deletions) export(set_diff) +export(update_structural_metadata) export(validation_checks) importFrom(dplyr,"%>%") importFrom(rlang,":=") diff --git a/R/create_structural_metadata.R b/R/create_structural_metadata.R index 4c4bcda..7da90ee 100644 --- a/R/create_structural_metadata.R +++ b/R/create_structural_metadata.R 
@@ -6,6 +6,10 @@ #' #' @param data Any named object. Expects a table but will work #' superficially with lists or named vectors. +#' @param primary_key Character. name of field that serves as a primary key +#' @param foreign_key Character. Field or fields that are foreign keys +#' @param additional_elements Empty tibble with structural metadata elements and +#' their types. #' #' @details #' @@ -27,9 +31,18 @@ #' @examples #' \dontrun{ #' df <- data.frame(a = 1:10, b = letters[1:10]) -#' df_metadata <- create_structural_metadata(df) +#' df_metadata <- ohcleandat::create_structural_metadata(df) #' write.csv(df_metadata,"df_metadata.csv") #' +#' +#' Additional elements can be added via a tibble +#' additional_elements <- tibble::tibble(table_name = NA_character_, +#' created_by = NA_character_, +#' updated = NA +#' ) +#' df_metadata <- ohcleandat::create_structural_metadata(df, +#' additional_elements = additional_elements) +#' #' # lets pretend we are using a dataset which already has #' ## in airtable, you can add field descriptions directly #' ## in the base. 
We want those exported and properly formatted @@ -45,7 +58,7 @@ #' #' airtable_df <- airtabler::fetch_all(base = base, table_name = table_name) #' -#' airtable_df_metadata <- create_structural_metadata(airtable_df) +#' airtable_df_metadata <- ohcleandat::create_structural_metadata(airtable_df) #' #' metadata_joined <- dplyr::left_join(airtable_df_metadata,airtable_metadata, #' by = c("name"="field_name")) @@ -83,7 +96,7 @@ #' #' data_odk <- ruODK::odata_submission_get() #' data_odk_rect <- ruODK::odata_submission_rectangle(data_odk) -#' odk_metadata <- create_structural_metadata(data_odk_rect) +#' odk_metadata <- ohcleandat::create_structural_metadata(data_odk_rect) #' #' #' odk_metadata_joined <- dplyr::left_join(odk_metadata,schema_simple, @@ -96,26 +109,145 @@ #' #' } #' -create_structural_metadata <- function(data){ +create_structural_metadata <- function(data, + primary_key = "", + foreign_key= "", + additional_elements = tibble::tibble()){ # create empty data frame metadata <- tibble::tibble( - name = character(), - description = character(), - units = character(), - term_uri = character(), - comments = character(), - primary_key = logical(), - foreign_key = logical() + name = names(data), + description = NA_character_, + units = NA_character_, + term_uri = NA_character_, + comments = NA_character_, + primary_key = FALSE, + foreign_key = FALSE ) + if(nrow(additional_elements) > 0){ + + # check that elements aren't repeated + if(any(names(additional_elements) %in% names(metadata))) { + names_index <- which(names(additional_elements) %in% names(metadata)) + repeat_names <- paste(names(additional_elements)[names_index],collapse = ", ") + msg <- sprintf("additional_elements repeats the following fields: %s",repeat_names) + rlang::abort(msg) + } + metadata<- cbind(metadata,additional_elements) + } + + if(nzchar(primary_key)){ + + pkey_check <- metadata$name == primary_key - # get fields - metadata$name = names(data) + if(any(pkey_check)){ + 
metadata[which(pkey_check),"primary_key"] <- TRUE + } else { + rlang::warn(sprintf("Primary key not in data: %s", primary_key)) + } + } + + if(any(nzchar(foreign_key))){ + + fkey_check <- metadata$name %in% foreign_key + + if(any(fkey_check)){ + metadata[which(fkey_check),"foreign_key"] <- TRUE + } else { + rlang::warn(sprintf("Foreign key not in data: %s", paste(foreign_key[nzchar(foreign_key)], collapse = ", "))) + } + } return(metadata) +} + + +#' Update structural metadata +#' +#' Appends rows and/or columns to existing metadata, change primary key and/or +#' adds foreign keys. +#' +#' @param data Any named object. Expects a table but will work +#' superficially with lists or named vectors. +#' @param metadata Data frame. Output from `create_structural_metadata` +#' @param primary_key Character. OPTIONAL Primary key in the data +#' @param foreign_key Character. OPTIONAL Foreign key or keys in the data +#' @param additional_elements data frame. OPTIONAL Empty tibble with structural +#' metadata elements and their types. +#' +#' @return data.frame +#' @export +#' +#' @examples +update_structural_metadata <- function(data,metadata,primary_key = "", foreign_key = "",additional_elements = tibble::tibble()){ + + existing_fkeys <- metadata |> + dplyr::filter(foreign_key) |> + dplyr::pull(name) + + if(rlang::is_empty(existing_fkeys)){ + existing_fkeys <- "" + } + + if(any(nzchar(foreign_key))){ + foreign_key <- c(foreign_key,existing_fkeys) + } else { + foreign_key <- existing_fkeys + + } + + if(!nzchar(primary_key)){ + # keep the first key already flagged in metadata; "" when none is flagged + existing_pkey <- metadata$name[metadata$primary_key %in% TRUE] + primary_key <- c(existing_pkey, "")[1] + } + + # make new structural metadata + new_sm <- ohcleandat::create_structural_metadata(data, + additional_elements = additional_elements, + primary_key = primary_key , + foreign_key= foreign_key) + + # cols to add old metadata + if(nrow(additional_elements) > 0){ + + # check 
that elements aren't repeated + if(any(names(additional_elements) %in% names(metadata))) { + names_index <- which(names(additional_elements) %in% names(metadata)) + 
repeat_names <- paste(names(additional_elements)[names_index],collapse = ", ") + msg <- sprintf("additional_elements repeats the following fields: %s",repeat_names) + rlang::abort(msg) + } + metadata<- cbind(metadata,additional_elements) + } + + # rows to append + data_to_append <- dplyr::anti_join(new_sm,metadata,"name") + + # rows to drop + rows_to_drop <- dplyr::anti_join(metadata,new_sm,"name") + + metadata_full <- rbind(metadata, data_to_append) + + if(nrow(rows_to_drop) > 0){ + name_filter <- rows_to_drop |> + dplyr::pull(name) + + metadata_full <- metadata_full |> + dplyr::filter(!name %in% {name_filter}) + + } + + # update keys - align on name because row order can differ from new_sm + key_rows <- match(metadata_full$name, new_sm$name) + metadata_full$primary_key <- new_sm$primary_key[key_rows] + metadata_full$foreign_key <- new_sm$foreign_key[key_rows] + + return(metadata_full) } + diff --git a/man/create_structural_metadata.Rd b/man/create_structural_metadata.Rd index 6c54ca8..9445cc4 100644 --- a/man/create_structural_metadata.Rd +++ b/man/create_structural_metadata.Rd @@ -4,11 +4,23 @@ \alias{create_structural_metadata} \title{Create Structural Metadata from a dataframe} \usage{ -create_structural_metadata(data) +create_structural_metadata( + data, + primary_key = "", + foreign_key = "", + additional_elements = tibble::tibble() +) } \arguments{ \item{data}{Any named object. Expects a table but will work superficially with lists or named vectors.} + +\item{primary_key}{Character. name of field that serves as a primary key} + +\item{foreign_key}{Character. 
Field or fields that are foreign keys} + +\item{additional_elements}{Empty tibble with structural metadata elements and +their types.} } \value{ dataframe with standard metadata requirements @@ -32,9 +44,18 @@ records in a different data set \examples{ \dontrun{ df <- data.frame(a = 1:10, b = letters[1:10]) -df_metadata <- create_structural_metadata(df) +df_metadata <- ohcleandat::create_structural_metadata(df) write.csv(df_metadata,"df_metadata.csv") + +Additional elements can be added via a tibble +additional_elements <- tibble::tibble(table_name = NA_character_, +created_by = NA_character_, +updated = NA +) +df_metadata <- ohcleandat::create_structural_metadata(df, + additional_elements = additional_elements) + # lets pretend we are using a dataset which already has ## in airtable, you can add field descriptions directly ## in the base. We want those exported and properly formatted @@ -50,7 +71,7 @@ write.csv(df_metadata,"df_metadata.csv") airtable_df <- airtabler::fetch_all(base = base, table_name = table_name) - airtable_df_metadata <- create_structural_metadata(airtable_df) + airtable_df_metadata <- ohcleandat::create_structural_metadata(airtable_df) metadata_joined <- dplyr::left_join(airtable_df_metadata,airtable_metadata, by = c("name"="field_name")) @@ -88,7 +109,7 @@ schema$choices_flat <-schema$`choices_english_(en)` |> data_odk <- ruODK::odata_submission_get() data_odk_rect <- ruODK::odata_submission_rectangle(data_odk) - odk_metadata <- create_structural_metadata(data_odk_rect) + odk_metadata <- ohcleandat::create_structural_metadata(data_odk_rect) odk_metadata_joined <- dplyr::left_join(odk_metadata,schema_simple, diff --git a/man/update_structural_metadata.Rd b/man/update_structural_metadata.Rd new file mode 100644 index 0000000..bd31c4b --- /dev/null +++ b/man/update_structural_metadata.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_structural_metadata.R +\name{update_structural_metadata} 
+\alias{update_structural_metadata} +\title{Update structural metadata} +\usage{ +update_structural_metadata( + data, + metadata, + primary_key = "", + foreign_key = "", + additional_elements = tibble::tibble() +) +} +\arguments{ +\item{data}{Any named object. Expects a table but will work +superficially with lists or named vectors.} + +\item{metadata}{Data frame. Output from \code{create_structural_metadata}} + +\item{primary_key}{Character. OPTIONAL Primary key in the data} + +\item{foreign_key}{Character. OPTIONAL Foreign key or keys in the data} + +\item{additional_elements}{data frame. OPTIONAL Empty tibble with structural +metadata elements and their types.} +} +\value{ +data.frame +} +\description{ +Appends rows and/or columns to existing metadata, change primary key and/or +adds foreign keys. +} diff --git a/vignettes/data_examples/datapackage.json b/vignettes/data_examples/datapackage.json new file mode 100644 index 0000000..25714e9 --- /dev/null +++ b/vignettes/data_examples/datapackage.json @@ -0,0 +1,79 @@ +{ + "resources": [ + { + "name": "my_data", + "path": "my_data.csv", + "profile": "tabular-data-resource", + "format": "csv", + "mediatype": "text/csv", + "encoding": "UTF-8", + "schema": { + "fields": [ + { + "name": "date", + "type": "date", + "description": "Date measurement was taken", + "units": null, + "term_uri": "https://schema.org/Date", + "comments": "Be careful when reading to excel. 
Date format may be changed", + "primary_key": false, + "foreign_key": false + }, + { + "name": "measurement", + "type": "number", + "description": "distance to ephemeral pan from livestock pen", + "units": "meters", + "term_uri": "https://schema.org/Distance", + "comments": null, + "primary_key": false, + "foreign_key": false + }, + { + "name": "measured_by", + "type": "string", + "description": "A person who measured the distance", + "units": null, + "term_uri": "https://schema.org/name", + "comments": null, + "primary_key": false, + "foreign_key": true + }, + { + "name": "site_name", + "type": "string", + "description": "Name of site", + "units": null, + "term_uri": "https://schema.org/name", + "comments": null, + "primary_key": false, + "foreign_key": true + }, + { + "name": "key", + "type": "number", + "description": "unique id for each row", + "units": null, + "term_uri": "https://schema.org/identifier", + "comments": null, + "primary_key": true, + "foreign_key": false + } + ] + } + } + ], + "metadata": { + "creator": [ + { + "name": "A. Person" + }, + { + "name": "B. 
Person" + } + ], + "description": "This is the abstract", + "identifier": "10.5281/zenodo.104126", + "title": "Example Dataset" + } +} diff --git a/vignettes/data_examples/my_data.csv b/vignettes/data_examples/my_data.csv new file mode 100644 index 0000000..0d42a6b --- /dev/null +++ b/vignettes/data_examples/my_data.csv @@ -0,0 +1,11 @@ +"date","measurement","measured_by","site_name","key" +2024-08-26,22,"Johana","c",1 +2024-08-27,94,"Collin","c",2 +2024-08-28,85,"Collin","b",3 +2024-08-29,63,"Collin","d",4 +2024-08-30,92,"Collin","e",5 +2024-08-31,8,"Collin","a",6 +2024-09-01,96,"Collin","e",7 +2024-09-02,53,"Johana","b",8 +2024-09-03,11,"Johana","a",9 +2024-09-04,97,"Johana","d",10 diff --git a/vignettes/metadata.Rmd b/vignettes/metadata.Rmd new file mode 100644 index 0000000..9ef8aa0 --- /dev/null +++ b/vignettes/metadata.Rmd @@ -0,0 +1,225 @@ +--- +title: "Metadata: Creating Standard Metadata with {ohcleandat} and {deposits}" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{metadata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( +collapse = TRUE, +comment = "#>" +) +``` + +```{r eval=FALSE} +library(ohcleandat) +library(deposits) +library(frictionless) +``` + +One of the primary aims of cleaning data is to be able to work with it. Whether +working with data shortly after it was created, or years later, proper metadata +make it easier to know what's in a dataset or data package. + +In extremely broad terms, there are two types of metadata: +1) descriptive metadata - the who what where why when of the dataset. +2) structural metadata - what do individual fields mean and how do they fit together? +This vignette focuses on creating descriptive metadata with the {deposits} and +{frictionless} packages and structural metadata using functions in {ohcleandat}. 
+ +## Creating structural metadata with {ohcleandat} + +Because {ohcleandat} largely deals with cleaning tabular data, we can use +a standard (and extensible) structural metadata format to describe outputs. + +The basic structural metadata table produced has the following elements + +- `name` = The name of the field. This is taken as is from `data`. +- `description` = Description of that field. May be provided by controlled vocabulary +- `units` = Units of measure for that field. May or may not apply +- `term_uri` = Uniform Resource Identifier for a term from a controlled vocabulary or schema +- `comments` = Free text providing additional details about the field +- `primary_key` = `TRUE` or `FALSE`, Uniquely identifies each record in the data +- `foreign_key` = `TRUE` or `FALSE`, Allows for linkages between data sets. Uniquely identifies +records in a different data set + +Additional metadata elements can be added by creating, then column binding, a one-row +`data.frame` of `NA` placeholders to the basic structure. + + +### Basic Example: +```{r eval=FALSE} + +## read in your data +data_to_describe <- tibble::tibble(date = as.Date(19961:19970), + measurement = sample(1:100,10), + measured_by = sample(c("Collin","Johana"),size = 10,replace = TRUE), + site_name = sample(letters[1:5],size = 10,replace = TRUE), + field_we_dont_need = "nothing useful here" +) + +# create metadata +structural_metadata <- ohcleandat::create_structural_metadata(data_to_describe) + +# write to csv +structural_metadata |> + write.csv(file = "metadata_examples/structural_metadata.csv",row.names = FALSE) + +# Fill in metadata by hand and read + +structural_metadata_complete <- readr::read_csv(file = "metadata_examples/structural_metadata_complete.csv") +``` + + +### What if the structure of my data changes? + +Do I have to re-write my metadata? Maybe! But in certain circumstances you +can just update the metadata. 
+ + + +```{r eval=FALSE} + +## oops I forgot to add a primary key +data_to_describe$key <- 1:10 + + +# add primary key and label it as a primary key in the metadata + +structural_metadata_pk<- ohcleandat::update_structural_metadata(data = data_to_describe,metadata = structural_metadata_complete, primary_key = "key") + +# oh also, the measured_by field is a foreign key + +structural_metadata_fk <- ohcleandat::update_structural_metadata(data = data_to_describe, + metadata = structural_metadata_pk, + foreign_key = "measured_by") + +## yeah we deleted that field - sorry! + +data_to_describe_drop_field <- data_to_describe |> + dplyr::select(-field_we_dont_need) + +structural_metadata_clean <- ohcleandat::update_structural_metadata( + data = data_to_describe_drop_field, + metadata = structural_metadata_fk) + +write.csv(structural_metadata_clean, + file = "metadata_examples/structural_metadata.csv", + row.names = FALSE) + +``` + +## Depositing data into an archive with {deposits} + +Okay - so I want to deposit my data using the [{deposits}](https://docs.ropensci.org/deposits/) package. {deposits} uses +the [frictionless data standard](https://docs.ropensci.org/frictionless/articles/frictionless.html) +so I end up with this thing called `datapackage.json` that stores all of my metadata. + +The `datapackage.json` structural metadata is pretty minimal - it only includes +field name and type (numeric, character, etc). So we want to add our metadata +to that. [add something about DCMI terms] + +See the [{deposits} setup guide](https://docs.ropensci.org/deposits/articles/install-setup.html) for +getting an api token and installing the package. 
+ +```{r eval=FALSE} +library(deposits) + +# set deposits token +# this can also be done in the `.Renviron` file or +# via a .env file using the {dotenv} package +# dotenv::load_dot_env(file = "../.env") +Sys.setenv ("ZENODO_SANDBOX_TOKEN" = "") + +# make sure your data are saved to a file + +write.csv(data_to_describe_drop_field,file = "data_examples/my_data.csv",row.names = FALSE) + +# make sure your updated metadata file is loaded + +structural_metadata <- readr::read_csv("metadata_examples/structural_metadata_complete.csv") + + +# Create descriptive metadata + +# check valid DCMI terms by calling deposits::dcmi_terms() +# see term definitions by calling: deposits::deposits_metadata_template(filename = "metadata_template.json") + +descriptive_metadata <- list ( + title = "Example Dataset", + description = "This is the abstract", + creator = list (list (name = "A. Person"), list (name = "B. Person")) + # , accessRights = "open" +) + +# create a new client +cli <- deposits::depositsClient$new(service = "zenodo", + metadata = descriptive_metadata, sandbox = TRUE) + + +# create a new deposit item - this creates a placeholder in zenodo +# for your items +cli$deposit_new() + +# add your data - you can add individual files or whole folders +# this will make a datapackage.json item in data_examples +cli$deposit_add_resource(path = "data_examples/my_data.csv") + +# Take a peek at the `datapackage.json` file - you'll see the first section +# describes the csv file, the second section describes the data in the csv, +# the third section contains the descriptive metadata we created + +# Add in your metadata + +data_package <- frictionless::read_package("data_examples/datapackage.json") + +# get the schema for a resource in the data package +my_data_schema <- data_package|> + get_schema("my_data") + + +## build up schema based on structural metadata + +for(idx in 1:length(my_data_schema$fields)){ + # item to build out + x <- my_data_schema$fields[[idx]] + for(idy in 
1:length(structural_metadata)){ + + y <- structural_metadata[idx,idy][[1]] + # get property name + property_to_add_name <- names(structural_metadata)[idy] + + # skip properties that already exist + if(property_to_add_name %in% names(x)){ + next() + } + + property_to_add_value <- y + names(property_to_add_value) <- property_to_add_name + x <- c(x, property_to_add_value) + } + + # update + my_data_schema$fields[[idx]] <- x +} + +# update the datapackage.json +data_package <- data_package|> + frictionless::remove_resource("my_data") |> + frictionless::add_resource(resource_name = "my_data", + data = "data_examples/my_data.csv", + schema = my_data_schema, + ) + +# write the datapackage.json +frictionless::write_package(data_package,directory = "data_examples/") + +# upload to zenodo - this creates a draft deposit in Zenodo +cli$deposit_upload_file(path = "data_examples/") + +# there are methods for embargoing or restricting + +``` diff --git a/vignettes/metadata_examples/structural_metadata.csv b/vignettes/metadata_examples/structural_metadata.csv new file mode 100644 index 0000000..2cbcd24 --- /dev/null +++ b/vignettes/metadata_examples/structural_metadata.csv @@ -0,0 +1,6 @@ +"name","description","units","term_uri","comments","primary_key","foreign_key" +"date","Date measurement was taken",NA,"https://schema.org/Date","Be careful when reading to excel. 
Date format may be changed",FALSE,FALSE +"measurement","distance to ephemeral pan from livestock pen","meters","https://schema.org/Distance",NA,FALSE,FALSE +"measured_by","A person who measured the distance",NA,"https://schema.org/name",NA,FALSE,TRUE +"site_name","Name of site",NA,"https://schema.org/name",NA,FALSE,TRUE +"key",NA,NA,NA,NA,TRUE,FALSE diff --git a/vignettes/metadata_examples/structural_metadata_complete.csv b/vignettes/metadata_examples/structural_metadata_complete.csv new file mode 100644 index 0000000..44253fd --- /dev/null +++ b/vignettes/metadata_examples/structural_metadata_complete.csv @@ -0,0 +1,6 @@ +name,description,units,term_uri,comments,primary_key,foreign_key +date,Date measurement was taken,NA,https://schema.org/Date,Be careful when reading to excel. Date format may be changed,FALSE,FALSE +measurement,distance to ephemeral pan from livestock pen,meters,https://schema.org/Distance,NA,FALSE,FALSE +measured_by,A person who measured the distance,NA,https://schema.org/name,NA,FALSE,TRUE +site_name,Name of site,NA,https://schema.org/name,NA,FALSE,TRUE +key,unique id for each row,NA,https://schema.org/identifier,NA,TRUE,FALSE \ No newline at end of file diff --git a/vignettes/metadata_template.json b/vignettes/metadata_template.json new file mode 100644 index 0000000..4a8b012 --- /dev/null +++ b/vignettes/metadata_template.json @@ -0,0 +1,83 @@ +{ + "_note": "Fields like this starting with underscores are comments. Please delete this field, and all fields in this template except for those you wish to use for your deposit. This template may NOT be used in anything like this default form to construct a 'deposits' client. Many values require editing to comply with expected formats described throughout, such as dates or fields expected to accord with some fixed vocabulary, or modification from text descriptions to JSON objects. 
Please refer to the main 'dc/schema' file of the 'deposits' package for full details.", + "abstract": "Abstract text", + "accessRights": "Access rights, which for Zenodo must be one of 'open', 'embargoed', 'restricted', or 'closed'.", + "accrualMethod": "See https://www.dublincore.org/specifications/dublin-core/collection-description/accrual-method/; must be one of 'deposit', 'donation', 'purchase', 'loan', 'license', or 'item_creation'", + "accrualPeriodicity": "Recommended vocabulary at https://www.dublincore.org/specifications/dublin-core/collection-description/frequency/", + "accrualPolicy": "See https://www.dublincore.org/specifications/dublin-core/collection-description/accrual-policy/. Must be one of 'closed', 'passive', 'active', 'partial'.", + "alternative": "Alternative name or title for the resource", + "audience": "A class of agents for whom the resource is intended or useful. Recommended practice is to use this property with non-literal values from a vocabulary of audience types.", + "available": "Date from which resource will be available.", + "bibliographicCitation": "A bibliographic reference for the resource.", + "conformsTo": "An established standard to which the described resource conforms.", + "contributor": [ + { + "name": "Name of first contributor. 
(Note that authors are specified in the 'creator' field, and a 'contributor' is not a 'creator'.)", + "type": "(Optional) For Zenodo; one of 'ContactPerson', 'DataCollector', 'DataCurator', 'DataManager', 'Distributor', 'Editor', 'HostingInstitution', 'Producer', 'ProjectLeader', 'ProjectManager', 'ProjectMember', 'RegistrationAgency', 'RegistrationAuthority', 'RelatedPerson', 'Researcher', 'ResearchGroup', 'RightsHolder', 'Supervisor', 'Sponsor', 'WorkPackageLeader', 'Other'.", + "affiliation": "(Optional) Affiliation of contributor", + "orcid": "(Optional) ORCID ID of contributor.", + "gnd": "(Optional) GND number of contributor" + }, + { + "name": "Name of second contributor" + } + ], + "coverage": "The spatial or temporal topic of the resource, spatial applicability of the resource, or jurisdiction under which the resource is relevant.", + "created": "A date or date-time string specifying when the resource was created.", + "creator": [ + { + "name": "Name of first creator.", + "affiliation": "(Optional, Zenodo only)", + "id": "(Optional, figshare only) Integer 'id' on Figshare of creator.", + "first_name": "(Optional, figshare only)", + "last_name": "(Optional, figshare only)", + "email": "(Optional, figshare only)", + "orcid": "(Optional)", + "gnd": "(Optional, Zenodo only)" + }, + { + "name": "Name of second creator." + } + ], + "date": "A point or period of time associated with an event in the lifecycle of the resource.", + "dateAccepted": "Date of acceptance of resource", + "dateCopyrighted": "Date of copyright of resource", + "dateSubmitted": "Date of submission of resource", + "description": "Description of resource.", + "educationLevel": "A class of agents, defined in terms of progression through an educational or training context, for which the described resource is intended.", + "extent": "The size or duration of the resource. 
Recommended practice is to specify the file size in megabytes and duration in ISO 8601 format.", + "format": "The file format, physical medium, or dimensions of the resource. For Zenodo, one of 'publication', 'poster', 'presentation', 'dataset', 'image', 'video', 'software', 'lesson', 'physicalobject', 'other'. For Figshare, one of ['figure', 'online_resource', 'preprint', 'book', 'conference_contribution', 'media', 'dataset', 'poster', 'journal_contribution', 'presentation', 'thesis', 'software'].", + "hasFormat": "A related resource that is substantially the same as the pre-existing described resource, but in another format. This property is an inverse property of Is Format Of.", + "hasPart": "A related resource that is included either physically or logically in the described resource. This property is an inverse property of Is Part Of.", + "hasVersion": "A related resource that is a version, edition, or adaptation of the described resource. Changes in version imply substantive changes in content rather than differences in format. This property is an inverse property of Is Version Of.", + "identifier": "An unambiguous reference to the resource within a given context. Recommended practice is to identify the resource by means of a string conforming to an identification system. Examples include International Standard Book Number (ISBN), Digital Object Identifier (DOI), and Uniform Resource Name (URN). Persistent identifiers should be provided as HTTP URIs.", + "instructionalMethod": "A process, used to engender knowledge, attitudes and skills, that the described resource is designed to support. Instructional Method typically includes ways of presenting instructional materials or conducting instructional activities, patterns of learner-to-learner and learner-to-instructor interactions, and mechanisms by which group and individual levels of learning are measured. 
Instructional methods include all aspects of the instruction and learning processes from planning and implementation through evaluation and feedback.", + "isFormatOf": "A pre-existing related resource that is substantially the same as the described resource, but in another format. This property is an inverse property of Has Format.", + "isPartOf": "A related resource in which the described resource is physically or logically included. This property is an inverse property of Has Part.", + "isReferencedBy": "A related resource that references, cites, or otherwise points to the described resource. This property is an inverse property of References. For Zenodo, this must be an array of items, each of which specifies a 'name' and 'relation', where 'relation' follows a fixed vocabulary.", + "isReplacedBy": "A related resource that supplants, displaces, or supersedes the described resource. This property is an inverse property of Replaces. For Zenodo, this must be an array of items, each of which specifies a 'name' and 'relation', where 'relation' follows a fixed vocabulary.", + "isRequiredBy": "A related resource that requires the described resource to support its function, delivery, or coherence. This property is an inverse property of Requires. For Zenodo, this must be an array of items, each of which specifies a 'name' and 'relation', where 'relation' follows a fixed vocabulary.", + "issued": "Date or date-time of formal issuance of the resource.", + "isVersionOf": "A related resource of which the described resource is a version, edition, or adaptation. Changes in version imply substantive changes in content rather than differences in format. This property is an inverse property of Has Version. For Zenodo, this must be an array of items, each of which specifies an 'identifier', 'relation', and 'resource_type', where 'relation' follows a fixed vocabulary.", + "language": "A language of the resource. 
Recommended practice is to use either a non-literal value representing a language from a controlled vocabulary such as ISO 639-2 or ISO 639-3, or a literal value consisting of an IETF Best Current Practice 47 [IETF-BCP47] language tag.", + "license": "A single string for Zenodo. An integer for Figshare, corresponding to their enumeration of possible licenses.", + "mediator": "An entity that mediates access to the resource. In an educational context, a mediator might be a parent, teacher, teaching assistant, or care-giver.", + "medium": "The material or physical carrier of the resource.", + "modified": "Date or date-time on which the resource was changed.", + "provenance": "A statement of any changes in ownership and custody of the resource since its creation that are significant for its authenticity, integrity, and interpretation. The statement may include a description of any changes successive custodians made to the resource.", + "publisher": "An entity responsible for making the resource available.", + "references": "A related resource that is referenced, cited, or otherwise pointed to by the described resource. This property is an inverse property of Is Referenced By.", + "relation": "A related resource. Recommended practice is to identify the related resource by means of a URI. If this is not possible or feasible, a string conforming to a formal identification system may be provided. For Zenodo, this must be an array of 'related_identifiers', each item of which has an 'identifier', a 'relation' adhering to a fixed vocabulary, and a 'resource_type'.", + "replaces": "A related resource that is supplanted, displaced, or superseded by the described resource. This property is an inverse property of Is Replaced By. 
For Zenodo, this must be an array of 'related_identifiers', each item of which has an 'identifier', a 'relation' adhering to a fixed vocabulary, and a 'resource_type'.", + "requires": "A related resource that is required by the described resource to support its function, delivery, or coherence. This property is an inverse property of Is Required By. For Zenodo, this must be an array of 'related_identifiers', each item of which has an 'identifier', a 'relation' adhering to a fixed vocabulary, and a 'resource_type'.", + "rights": "Information about rights held in and over the resource. Typically, rights information includes a statement about various property rights associated with the resource, including intellectual property rights. Recommended practice is to refer to a rights statement with a URI. If this is not possible or feasible, a literal value (name, label, or short text) may be provided.", + "rightsHolder": "A person or organization owning or managing rights over the resource. Recommended practice is to refer to the rights holder with a URI. If this is not possible or feasible, a literal value that identifies the rights holder may be provided.", + "source": "A related resource from which the described resource is derived. The described resource may be derived from the related resource in whole or in part. Best practice is to identify the related resource by means of a URI or a string conforming to a formal identification system.", + "spatial": "Spatial characteristics of the resource.", + "subject": "A topic of the resource. Recommended practice is to refer to the subject with a URI. If this is not possible or feasible, a literal value that identifies the subject may be provided. Both should preferably refer to a subject in a controlled vocabulary. 
For Zenodo, this can include an array of keywords.", + "tableOfContents": "A list of subunits of the resource.", + "temporal": "Temporal characteristics of the resource.", + "title": "A name given to the resource.", + "type": "The nature or genre of the resource. Recommended practice is to use a controlled vocabulary such as the DCMI Type Vocabulary [DCMI-TYPE]. To describe the file format, physical medium, or dimensions of the resource, use the property Format.", + "valid": "Date or date-time (often a range) of validity of a resource. Recommended practice is to describe the date, date/time, or period of time as recommended for the property Date, of which this is a subproperty." +} From 59f8dfbe1d8da4bbdf32ac969cc901a8e260b71f Mon Sep 17 00:00:00 2001 From: Collin Schwantes Date: Tue, 27 Aug 2024 11:45:58 -0600 Subject: [PATCH 5/8] fixed indent issue with metadata vignette --- vignettes/metadata.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/metadata.Rmd b/vignettes/metadata.Rmd index 9ef8aa0..c60a47d 100644 --- a/vignettes/metadata.Rmd +++ b/vignettes/metadata.Rmd @@ -7,7 +7,7 @@ vignette: > %\VignetteEncoding{UTF-8} --- - ```{r, include = FALSE} +```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" @@ -126,7 +126,7 @@ See the [{deposits} setup guide](https://docs.ropensci.org/deposits/articles/ins getting an api token and installing the package. 
```{r eval=FALSE} -library(deposits) + # set deposits token # this can also be done in the `.Renviron` file or From d94489e6a32ed9f1f366985c33aeaa1cbbe28807 Mon Sep 17 00:00:00 2001 From: Collin Schwantes Date: Wed, 28 Aug 2024 12:13:05 -0600 Subject: [PATCH 6/8] adding function for expanding fl metadata --- R/expand_frictionless_metadata.R | 96 +++++++++++++++++++++++++++++ man/expand_frictionless_metadata.Rd | 62 +++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 R/expand_frictionless_metadata.R create mode 100644 man/expand_frictionless_metadata.Rd diff --git a/R/expand_frictionless_metadata.R b/R/expand_frictionless_metadata.R new file mode 100644 index 0000000..3bd74dd --- /dev/null +++ b/R/expand_frictionless_metadata.R @@ -0,0 +1,96 @@ +#' Expand Frictionless Metadata with structural metadata +#' +#' Loops over elements in the structural metadata and adds them to frictionless +#' metadata schema. +#' +#' @param structural_metadata Dataframe. Structural metadata from +#' `create_structural_metadata` or `update_structural_metadata` +#' @param resource_name Character. Item within the datapackage to be updated +#' @param resource_path Character. Path to csv file +#' @param data_package_path Character. 
Path to datapackage.json file +#' +#' @return Invisibly returns `NULL`; called for its side effect of rewriting the datapackage.json file +#' @export +#' +#' @examples +#' \dontrun{ +#' +#' # read in file +#' data_path <- "my/data.csv" +#' data <- read.csv(data_path) +#' +#' # create structural metadata +#' data_codebook <- create_structural_metadata(data) +#' +#' # update structural metadata +#' write.csv(data_codebook,"my/codebook.csv", row.names = FALSE) +#' +#' data_codebook_updated <- read.csv("my/codebook.csv") +#' +#' # create frictionless package - this is done automatically with the +#' # deposits package +#' my_package <- +#' create_package() |> +#' add_resource(resource_name = "data", data = data_path) +#' +#' write_package(my_package,"my") +#' +#' expand_frictionless_metadata(structural_metadata = data_codebook_updated, +#' resource_name = "data", +#' resource_path = data_path, +#' data_package_path = "my/datapackage.json" +#' ) +#' +#' } +#' +expand_frictionless_metadata <- function(structural_metadata, + resource_name, + resource_path, + data_package_path ){ + + data_package <- frictionless::read_package(data_package_path) + + data_package_dir <- dirname(data_package_path) + + # get the schema for a resource in the data package + my_data_schema <- data_package|> + frictionless::get_schema(resource_name) + + ## build up schema based on structural metadata; row idx of the metadata is assumed to describe field idx of the schema + + for(idx in seq_along(my_data_schema$fields)){ + # schema field to build out + x <- my_data_schema$fields[[idx]] + for(idy in seq_along(structural_metadata)){ + + y <- structural_metadata[idx,idy][[1]] + # get property name (the metadata column name) + property_to_add_name <- names(structural_metadata)[idy] + + # skip properties that already exist in the schema field (e.g. 
name) + if(property_to_add_name %in% names(x)){ + next() + } + + property_to_add_value <- y + names(property_to_add_value) <- property_to_add_name + x <- c(x, property_to_add_value) + } + + # write the expanded field back into the schema + my_data_schema$fields[[idx]] <- x + } + + # update the datapackage.json: swap the resource for one carrying the expanded schema + data_package <- data_package|> + frictionless::remove_resource(resource_name) |> + frictionless::add_resource(resource_name = resource_name, + data = resource_path, + schema = my_data_schema + ) + + # write the datapackage.json next to the original file + frictionless::write_package(data_package,directory = data_package_dir) + + invisible() +} diff --git a/man/expand_frictionless_metadata.Rd b/man/expand_frictionless_metadata.Rd new file mode 100644 index 0000000..58e5bad --- /dev/null +++ b/man/expand_frictionless_metadata.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expand_frictionless_metadata.R +\name{expand_frictionless_metadata} +\alias{expand_frictionless_metadata} +\title{Expand Frictionless Metadata with structural metadata} +\usage{ +expand_frictionless_metadata( + structural_metadata, + resource_name, + resource_path, + data_package_path +) +} +\arguments{ +\item{structural_metadata}{Dataframe. Structural metadata from +\code{create_structural_metadata} or \code{update_structural_metadata}} + +\item{resource_name}{Character. Item within the datapackage to be updated} + +\item{resource_path}{Character. Path to csv file} + +\item{data_package_path}{Character. Path to datapackage.json file} +} +\value{ +Invisibly returns \code{NULL}; called for its side effect of rewriting the datapackage.json file +} +\description{ +Loops over elements in the structural metadata and adds them to frictionless +metadata schema. 
+} +\examples{ +\dontrun{ + +# read in file +data_path <- "my/data.csv" +data <- read.csv(data_path) + +# create structural metadata +data_codebook <- create_structural_metadata(data) + +# update structural metadata +write.csv(data_codebook,"my/codebook.csv", row.names = FALSE) + +data_codebook_updated <- read.csv("my/codebook.csv") + +# create frictionless package - this is done automatically with the +# deposits package +my_package <- + create_package() |> + add_resource(resource_name = "data", data = data_path) + + write_package(my_package,"my") + +expand_frictionless_metadata(structural_metadata = data_codebook_updated, + resource_name = "data", + resource_path = data_path, + data_package_path = "my/datapackage.json" + ) + +} + +} From 88c25135876697200ccb1f7dd5b97faaf1eb89dc Mon Sep 17 00:00:00 2001 From: Collin Schwantes Date: Wed, 28 Aug 2024 12:13:37 -0600 Subject: [PATCH 7/8] incorporating fl metadata function into vignettes --- DESCRIPTION | 1 + NAMESPACE | 1 + R/create_structural_metadata.R | 4 +- man/update_structural_metadata.Rd | 3 + vignettes/data_examples/my_data.csv | 20 +++--- vignettes/metadata.Rmd | 61 ++++--------------- .../metadata_examples/structural_metadata.csv | 2 +- 7 files changed, 31 insertions(+), 61 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a65ea22..5f70445 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,6 +23,7 @@ Imports: arsenal, containerTemplateUtils (>= 0.0.0.9006), dplyr, + frictionless, googledrive, googlesheets4, here, diff --git a/NAMESPACE b/NAMESPACE index facfb63..c5441a1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,6 +14,7 @@ export(detect_language) export(download_dropbox) export(download_googledrive_files) export(dropbox_upload) +export(expand_frictionless_metadata) export(get_dropbox_val_logs) export(get_odk_form_schema) export(get_odk_responses) diff --git a/R/create_structural_metadata.R b/R/create_structural_metadata.R index 7da90ee..a207b41 100644 --- a/R/create_structural_metadata.R +++ 
b/R/create_structural_metadata.R @@ -179,10 +179,10 @@ create_structural_metadata <- function(data, #' @param additional_elements data frame. OPTIONAL Empty tibble with structural #' metadata elements and their types. #' +#' @note See vignette on metadata for examples +#' #' @return data.frame #' @export -#' -#' @examples update_structural_metadata <- function(data,metadata,primary_key = "", foreign_key = "",additional_elements = tibble::tibble()){ existing_fkeys <- metadata |> diff --git a/man/update_structural_metadata.Rd b/man/update_structural_metadata.Rd index bd31c4b..72e9811 100644 --- a/man/update_structural_metadata.Rd +++ b/man/update_structural_metadata.Rd @@ -32,3 +32,6 @@ data.frame Appends rows and/or columns to existing metadata, change primary key and/or adds foreign keys. } +\note{ +See vignette on metadata for examples +} diff --git a/vignettes/data_examples/my_data.csv b/vignettes/data_examples/my_data.csv index 0d42a6b..5386d82 100644 --- a/vignettes/data_examples/my_data.csv +++ b/vignettes/data_examples/my_data.csv @@ -1,11 +1,11 @@ "date","measurement","measured_by","site_name","key" -2024-08-26,22,"Johana","c",1 -2024-08-27,94,"Collin","c",2 -2024-08-28,85,"Collin","b",3 -2024-08-29,63,"Collin","d",4 -2024-08-30,92,"Collin","e",5 -2024-08-31,8,"Collin","a",6 -2024-09-01,96,"Collin","e",7 -2024-09-02,53,"Johana","b",8 -2024-09-03,11,"Johana","a",9 -2024-09-04,97,"Johana","d",10 +2024-08-26,29,"Johana","b",1 +2024-08-27,53,"Johana","e",2 +2024-08-28,71,"Johana","d",3 +2024-08-29,43,"Collin","e",4 +2024-08-30,93,"Johana","b",5 +2024-08-31,4,"Collin","d",6 +2024-09-01,74,"Johana","e",7 +2024-09-02,13,"Johana","c",8 +2024-09-03,46,"Johana","e",9 +2024-09-04,44,"Collin","a",10 diff --git a/vignettes/metadata.Rmd b/vignettes/metadata.Rmd index c60a47d..90f8367 100644 --- a/vignettes/metadata.Rmd +++ b/vignettes/metadata.Rmd @@ -157,7 +157,8 @@ descriptive_metadata <- list ( # create a new client cli <- deposits::depositsClient$new(service = 
"zenodo", - metadata = descriptive_metadata, sandbox = TRUE) + metadata = descriptive_metadata, + sandbox = TRUE) # create a new deposit item - this creates a placeholder in zenodo @@ -168,58 +169,22 @@ cli$deposit_new() # this will make a datapackage.json item in data_examples cli$deposit_add_resource(path = "data_examples/my_data.csv") +## open the + # Take a peak at the `datapackage.json` file - you'll see the first section # describes the csv file, the second section describes the data in the csv, # the third section contains the descriptive metadata we created -# Add in your metadata - -data_package <- frictionless::read_package("data_examples/datapackage.json") - -# get the schema for a resource in the data package -my_data_schema <- data_package|> - get_schema("my_data") - - -## build up schema based on structural metadata - -for(idx in 1:length(my_data_schema$fields)){ - # item to build out - x <- my_data_schema$fields[[idx]] - for(idy in 1:length(structural_metadata)){ - - y <- structural_metadata[idx,idy][[1]] - # get property name - property_to_add_name <- names(structural_metadata)[idy] - - # skip properties that already exist - if(property_to_add_name %in% names(x)){ - next() - } - - property_to_add_value <- y - names(property_to_add_value) <- property_to_add_name - x <- c(x, property_to_add_value) - } - - # update - my_data_schema$fields[[idx]] <- x -} - -# update the datapackage.json -data_package <- data_package|> - frictionless::remove_resource("my_data") |> - frictionless::add_resource(resource_name = "my_data", - data = "data_examples/my_data.csv", - schema = my_data_schema, - ) - -# write the datapackage.json -frictionless::write_package(data_package,directory = "data_examples/") - -# upload to zenodo - this creates a draft deposit in Zenodo +# Add your structural metadata to the frictionless metadata + +expand_frictionless_metadata(structural_metadata = structural_metadata, + resource_name = "my_data", # name of the file with no extension + 
resource_path = "data_examples/my_data.csv", + data_package_path = "data_examples/datapackage.json") + +# upload to zenodo - this creates a **draft** deposit in Zenodo cli$deposit_upload_file(path = "data_examples/") -# there are methods for embargoing or restricting +# there are methods for embargoing or restricting deposits in {deposits} ``` diff --git a/vignettes/metadata_examples/structural_metadata.csv b/vignettes/metadata_examples/structural_metadata.csv index 2cbcd24..d250190 100644 --- a/vignettes/metadata_examples/structural_metadata.csv +++ b/vignettes/metadata_examples/structural_metadata.csv @@ -3,4 +3,4 @@ "measurement","distance to ephemeral pan from livestock pen","meters","https://schema.org/Distance",NA,FALSE,FALSE "measured_by","A person who measured the distance",NA,"https://schema.org/name",NA,FALSE,TRUE "site_name","Name of site",NA,"https://schema.org/name",NA,FALSE,TRUE -"key",NA,NA,NA,NA,TRUE,FALSE +"key","unique id for each row",NA,"https://schema.org/identifier",NA,TRUE,FALSE From 9b0dec265dae032d060bf814ad287963282663d0 Mon Sep 17 00:00:00 2001 From: Collin Schwantes Date: Wed, 28 Aug 2024 12:47:36 -0600 Subject: [PATCH 8/8] Delete vignettes/data_examples/datapackage.json --- vignettes/data_examples/datapackage.json | 79 ------------------------ 1 file changed, 79 deletions(-) delete mode 100644 vignettes/data_examples/datapackage.json diff --git a/vignettes/data_examples/datapackage.json b/vignettes/data_examples/datapackage.json deleted file mode 100644 index 25714e9..0000000 --- a/vignettes/data_examples/datapackage.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "resources": [ - { - "name": "my_data", - "path": "my_data.csv", - "profile": "tabular-data-resource", - "format": "csv", - "mediatype": "text/csv", - "encoding": "UTF-8", - "schema": { - "fields": [ - { - "name": "date", - "type": "date", - "description": "Date measurement was taken", - "units": null, - "term_uri": "https://schema.org/Date", - "comments": "Be careful when reading to 
excel. Date format may be changed", - "primary_key": false, - "foreign_key": false - }, - { - "name": "measurement", - "type": "number", - "description": "distance to ephemeral pan from livestock pen", - "units": "meters", - "term_uri": "https://schema.org/Distance", - "comments": null, - "primary_key": false, - "foreign_key": false - }, - { - "name": "measured_by", - "type": "string", - "description": "A person who measured the distance", - "units": null, - "term_uri": "https://schema.org/name", - "comments": null, - "primary_key": false, - "foreign_key": true - }, - { - "name": "site_name", - "type": "string", - "description": "Name of site", - "units": null, - "term_uri": "https://schema.org/name", - "comments": null, - "primary_key": false, - "foreign_key": true - }, - { - "name": "key", - "type": "number", - "description": "unique id for each row", - "units": null, - "term_uri": "https://schema.org/identifier", - "comments": null, - "primary_key": true, - "foreign_key": false - } - ] - } - } - ], - "metadata": { - "creator": [ - { - "name": "A. Person" - }, - { - "name": "B. Person" - } - ], - "description": "This is the abstract", - "identifier": "10.5281/zenodo.104126", - "title": "Example Dataset" - } -}