diff --git a/.github/workflows/check-full.yaml b/.github/workflows/check-full.yaml new file mode 100644 index 0000000..7c6cbcf --- /dev/null +++ b/.github/workflows/check-full.yaml @@ -0,0 +1,47 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * 0' + +name: R-CMD-check (full) +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true \ No newline at end of file diff --git a/.github/workflows/check-release.yaml b/.github/workflows/check-release.yaml deleted file mode 100644 index d5d7593..0000000 --- a/.github/workflows/check-release.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/master/examples -# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help -on: - push: - branches: [main, master] - pull_request: - branches: [main, master] - -name: R-CMD-check - -jobs: - R-CMD-check: - if: ${{ !contains(github.event.head_commit.message, '#skip_ci') }} - runs-on: ubuntu-latest - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - R_KEEP_PKG_SOURCE: yes - steps: - - uses: actions/checkout@v2 - - - uses: r-lib/actions/setup-r@v1 - with: - use-public-rspm: true - - - uses: r-lib/actions/setup-r-dependencies@v1 - with: - extra-packages: rcmdcheck - - - uses: r-lib/actions/check-r-package@v1 diff --git a/.github/workflows/check-standard.yaml b/.github/workflows/check-standard.yaml index 59d45f5..8ef4f9e 100644 --- a/.github/workflows/check-standard.yaml +++ b/.github/workflows/check-standard.yaml @@ -1,43 +1,29 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/master/examples +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - workflow_dispatch: + push: + branches: [main, master] + pull_request: + branches: [main, master] -name: R-CMD-check +name: R-CMD-check (standard) jobs: R-CMD-check: - runs-on: ${{ matrix.config.os }} - - name: ${{ matrix.config.os }} (${{ matrix.config.r }}) - - strategy: - fail-fast: false - matrix: - config: - - {os: macOS-latest, r: 'release'} - - {os: windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-latest, r: 'release'} - - {os: ubuntu-latest, r: 'oldrel-1'} - + runs-on: ubuntu-latest env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes - steps: - - uses: actions/checkout@v2 - - - uses: r-lib/actions/setup-pandoc@v1 + - uses: actions/checkout@v4 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 with: - r-version: ${{ matrix.config.r }} - http-user-agent: ${{ matrix.config.http-user-agent }} use-public-rspm: true - - uses: r-lib/actions/setup-r-dependencies@v1 + - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: rcmdcheck + extra-packages: any::rcmdcheck + needs: check - - uses: r-lib/actions/check-r-package@v1 + - uses: r-lib/actions/check-r-package@v2 \ No newline at end of file diff --git a/.github/workflows/rogtemplate-gh-pages.yaml b/.github/workflows/rogtemplate-gh-pages.yaml index dfa8623..5d25b7a 100644 --- a/.github/workflows/rogtemplate-gh-pages.yaml +++ b/.github/workflows/rogtemplate-gh-pages.yaml @@ -1,5 +1,6 @@ # Workflow derived from https://github.com/r-lib/actions/tree/master/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +# Workflow triggering derived from: https://stevenmortimer.com/running-github-actions-sequentially/ on: push: branches: [main, master] @@ -18,7 +19,7 @@ jobs: env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 @@ -33,7 +34,7 @@ jobs: local::. any::pkgdown ropengov/rogtemplate - any::magick + any::rcmdcheck - name: Build logo if not present and prepare template run: | @@ -52,6 +53,6 @@ jobs: - name: Deploy package run: | - git config --local user.name "$GITHUB_ACTOR" - git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com" Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' \ No newline at end of file diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index cd9fff7..8cd78cd 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -1,11 +1,10 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/master/examples +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: branches: [main, master] pull_request: branches: [main, master] - workflow_dispatch: name: test-coverage @@ -14,18 +13,20 @@ jobs: runs-on: ubuntu-latest env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 with: use-public-rspm: true - - uses: r-lib/actions/setup-r-dependencies@v1 + - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: covr + extra-packages: any::covr + needs: coverage - name: Test coverage - run: covr::codecov() - shell: Rscript {0} + run: covr::codecov(token = Sys.getenv("CODECOV_TOKEN"), quiet = FALSE) + shell: Rscript {0} \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 30aaba6..1e7384b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: hetu Type: Package Title: Structural Handling of Finnish Personal Identity Codes -Version: 1.0.7 -Date: 2022-05-20 +Version: 1.1.0.9000 +Date: 2024-11-19 Authors@R: c( person(given = "Pyry", @@ -43,7 +43,8 @@ Depends: Imports: lubridate, checkmate, - parallel + parallel, + methods Suggests: Cairo, knitr, @@ -51,7 +52,7 @@ Suggests: rmarkdown, covr, dplyr -RoxygenNote: 7.2.0 +RoxygenNote: 7.3.2 X-schema.org-isPartOf: http://ropengov.org/ X-schema.org-keywords: ropengov Config/Needs/website: diff --git a/NAMESPACE b/NAMESPACE index c5f1436..8832d55 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,9 @@ # Generated by roxygen2: do not edit by hand +S3method("[",diagnostic) +S3method(plot,diagnostic) +S3method(print,summary.diagnostic) +S3method(summary,diagnostic) export(bid_ctrl) export(hetu) export(hetu_age) @@ -8,6 +12,7 @@ export(hetu_ctrl) export(hetu_date) export(hetu_diagnostic) export(hetu_sex) +export(is.diagnostic) export(pin_age) export(pin_ctrl) export(pin_date) @@ -23,10 +28,14 @@ export(satu_ctrl) importFrom(checkmate,assert_choice) importFrom(checkmate,assert_date) importFrom(checkmate,assert_double) +importFrom(graphics,par) +importFrom(graphics,text) importFrom(lubridate,days) importFrom(lubridate,interval) importFrom(lubridate,period) importFrom(lubridate,weeks) importFrom(lubridate,years) importFrom(lubridate,ymd) +importFrom(methods,hasArg) importFrom(parallel,mclapply) +importFrom(utils,hasName) diff --git a/NEWS.md b/NEWS.md index 7fc0697..efde65c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,11 @@ # *News* ========== +# hetu 1.1.0 (2024-11-19) + +* Add summary method and plot methods for data.frames produced by `hetu_diagnostic()` +* Add support for new century markers. + # hetu 1.0.7.9000 (2022-05-16) * subsetting-parameter (TRUE or FALSE) dropped from `hetu_diagnostic()` function as it was unnecessary syntactic sugar that was difficult to communicate to users. Similar functionalities can be easily achieved with standard subsetting functionalities found in base R and especially in tidyverse. diff --git a/R/hetu.R b/R/hetu.R index 31d502f..36b3f4b 100644 --- a/R/hetu.R +++ b/R/hetu.R @@ -15,6 +15,9 @@ #' "\code{correct.ctrl.char}", "\code{valid.date}", "\code{valid.day}", #' "\code{valid.month}", "\code{valid.length}", "\code{valid.century}". #' Default is \code{FALSE} which returns no diagnostic information. +#' @param as.factor Makes fields "\code{sex}", "\code{p.num}", +#' "\code{ctrl.char}" and "\code{century}" into factors for slightly reduced +#' memory footprint. Default is FALSE. #' @return Finnish personal identity code data.frame, #' or if extract parameter is set, the requested part of the #' information as a vector. Returns an error or \code{NA} if the given @@ -23,20 +26,81 @@ #' A correct pin should be in the form DDMMYYCZZZQ, where DDMMYY stands for #' date, C for century sign, ZZZ for personal number and Q for control #' character.} -#' \item{sex}{sex of the person as a character vector ("Male" or "Female").} -#' \item{p.num}{Personal number part of the identity code.} -#' \item{ctrl.char}{Control character for the personal identity code.} -#' \item{date}{Birthdate.} -#' \item{day}{Day of the birthdate.} -#' \item{month}{Month of the birthdate.} -#' \item{year}{Year of the birthdate.} -#' \item{century}{Century character of the birthdate: + (1800), - (1900) or -#' A (2000).} +#' \item{sex}{sex of the person as a character vector ("Male" or "Female")} +#' \item{p.num}{Personal number (individual number) part of the identity code} +#' \item{ctrl.char}{Control character for the personal identity code} +#' \item{date}{Birthdate} +#' \item{day}{Day of the birthdate} +#' \item{month}{Month of the birthdate} +#' \item{year}{Year of the birthdate} +#' \item{century}{Century character determining the century (1800s, 1900s or +#' 2000s) of the person's birth. See details for more information} #' \item{valid.pin}{Does the personal identity code pass all validity #' checks: (\code{TRUE} or \code{FALSE})} +#' +#' @details +#' +#' Starting from 1st of January 2023, an amendment to the government decree on +#' the Population Information System (128/2010) has expanded the +#' number of available century markers (See references: Valtioneuvoston asetus +#' VM/2022/124) and scrapped some old practices. +#' +#' For the users of this package the most visible change will be that +#' people born in the 1900s can now be assigned with "Y", "X", "W", "V" or "U", +#' in addition to the old "-" (slash) marker. People born in the 2000s can be +#' assigned with "B", "C", "D", E" or "F", in addition to the old marker, "A". +#' For people born in the 1800s "+" (plus sign) remains the only valid marker. +#' The amendment does not affect already existing personal identity codes. +#' +#' The change was done to mitigate for the diminishing pool of available, unique +#' identity codes. For historical reasons, the century marker of +#' the code was not always taken into account when determining the uniqueness +#' of the number. This meant that individual number parts were not recycled +#' between people born in different centuries, diminishing the amount of +#' available numbers for people born in the new century. +#' For example, if a female born in the 1st of January 1901 +#' was assigned with the personal identity code "010101-0101" (individual code +#' part "010"), a female born in 1st of January 2001 could not be assigned with +#' the code "010101A0101" because it would contain the same individual code +#' as the person born in 1901 and individual codes could not be recycled. With +#' the amended decree the uniqueness of the personal identity code is considered +#' by looking at the personal identity code as a whole. This means that from now +#' on it would be permissible to have personal identity codes such as +#' "100190-999P" and "100190Y999P" at the same time, denoting two different +#' individuals (see references: Digital and population data services agency +#' announcement). +#' +#' In practice, codes with new separators will be issued only when the ranges +#' ranges with currently used separators run out. This means that it might +#' take a while until we see people born in the 2000s assigned with the century +#' marker "C" or people born in the 1900s assigned with the century marker "X", +#' as there are still plenty of numbers in ranges "B" and "Y" as well, in +#' addition to some numbers being left in the original ranges of "A" and "-". +#' The first personal identity code with a new separator "Y" was assigned +#' in December 2023 (see Digi- ja väestötietovirasto 2023). +#' +#' The result of all this is that the hetu package may now give "unrealistic" +#' personal identity codes in the sense that some codes are not yet actually +#' in use. However, it is not the aim of this package to simulate the +#' actual distributions of personal identity codes and their century markers in +#' the population (the actually used and unused codes are unknown to us), +#' but to provide a tool that can be used to extract data from these codes, +#' should the user encounter them at some point. Writing further sanity checks +#' is probably a good idea for people who are interested in detecting unusual +#' patterns in their databases and registries. +#' +#' @references +#' +#' Valtioneuvoston asetus VM/2022/124 \href{https://vm.fi/paatos?decisionId=0900908f807c5f3c}{Valtioneuvoston asetus VM/2022/124} +#' +#' Digi- ja väestötietovirasto. (2023). \href{https://dvv.fi/-/uudet-valimerkit-takaavat-henkilotunnusten-riittavyyden-ensimmainen-uudenlainen-henkilotunnus-myonnettiin-talla-viikolla}{Uudet välimerkit takaavat henkilötunnusten riittävyyden - ensimmäinen uudenlainen henkilötunnus myönnettiin tällä viikolla} +#' +#' Digital and Population Data Services Agency. \href{https://dvv.fi/en/reform-of-personal-identity-code}{Reform of the separators in the personal identity code} +#' #' @author Pyry Kantanen, Jussi Paananen -#' @seealso \code{\link{pin_ctrl}} For validating Finnish personal -#' identity codes. +#' @seealso +#' \code{\link{pin_ctrl}} Validating Finnish personal identity codes. +#' \code{\link{rhetu}} Generating random Finnish personal identity codes. #' @examples #' hetu("111111-111C") #' hetu("111111-111C")$date @@ -47,16 +111,20 @@ #' hetu(c("010101-0101", "111111-111C")) #' # Process a vector of hetu's and extract sex information from each #' hetu(c("010101-0101", "111111-111C"), extract="sex") +#' # Process codes with new century markers +#' new_codes <- c("010594Y9032", "010594Y9021", "020594X903P") +#' hetu(new_codes) #' #' @importFrom checkmate assert_choice #' #' @export -hetu <- function(pin, extract = NULL, allow.temp = FALSE, diagnostic = FALSE) { +hetu <- function(pin, extract = NULL, allow.temp = FALSE, diagnostic = FALSE, + as.factor = FALSE) { if (!is.null(extract)) { valid_choices <- c("hetu", "sex", "p.num", "ctrl.char", - "date", "day", "month", "year", "century", - "valid.pin") + "date", "day", "month", "year", "century", + "valid.pin") if (allow.temp == FALSE) { checkmate::assert_choice(extract, valid_choices) } else if (allow.temp == TRUE) { @@ -91,18 +159,24 @@ hetu <- function(pin, extract = NULL, allow.temp = FALSE, diagnostic = FALSE) { # Check century extracted_century_marker <- substr(pin, start = 7, stop = 7) - valid_century_test <- extracted_century_marker %in% c("+", "-", "A") + valid_century_test <- extracted_century_marker %in% c("+", "-", "A", "B", "C", + "D", "E", "F", "Y", "X", + "W", "V", "U") # Construct a full year based on century marker full_year_function <- function(pin) { extracted_century_marker <- substr(pin, start = 7, stop = 7) year <- as.character(substr(pin, start = 5, stop = 6)) - switch(extracted_century_marker, - "+" = as.numeric(paste0("18", year)), - "-" = as.numeric(paste0("19", year)), - "A" = as.numeric(paste0("20", year)), - NA - ) + if (extracted_century_marker %in% c("+")) { + res <- paste0("18", year) + } else if (extracted_century_marker %in% c("-", "Y", "X", "W", "V", "U")) { + res <- paste0("19", year) + } else if (extracted_century_marker %in% c("A", "B", "C", "D", "E", "F")) { + res <- paste0("20", year) + } else { + res <- NA + } + as.numeric(res) } # full_year_function takes 1 pin at a time so using vapply full_year <- vapply(pin, @@ -155,16 +229,29 @@ hetu <- function(pin, extract = NULL, allow.temp = FALSE, diagnostic = FALSE) { valid_pin_test <- apply(test_matrix, 1, all) # Create hetu-object - object <- list(hetu = pin, - sex = extracted_sex, - p.num = extracted_personal_number, - ctrl.char = extracted_ctrl_char, - date = extracted_date, - day = extracted_day, - month = extracted_month, - year = full_year, - century = extracted_century_marker, - valid.pin = valid_pin_test) + if (as.factor == TRUE) { + object <- list(hetu = pin, + sex = as.factor(extracted_sex), + p.num = as.factor(extracted_personal_number), + ctrl.char = as.factor(extracted_ctrl_char), + date = extracted_date, + day = extracted_day, + month = extracted_month, + year = full_year, + century = as.factor(extracted_century_marker), + valid.pin = valid_pin_test) + } else { + object <- list(hetu = pin, + sex = extracted_sex, + p.num = extracted_personal_number, + ctrl.char = extracted_ctrl_char, + date = extracted_date, + day = extracted_day, + month = extracted_month, + year = full_year, + century = extracted_century_marker, + valid.pin = valid_pin_test) + } if (diagnostic == TRUE) { # create hetu-object with diagnostics @@ -187,20 +274,20 @@ hetu <- function(pin, extract = NULL, allow.temp = FALSE, diagnostic = FALSE) { if (allow.temp == FALSE) { if (is.null(extract)) { object <- subset(quickdf(object), is_temp_test == FALSE) - #Remove temporary PINs - if (dim(object)[1] == 0) { - return(NA) #If all PINs were temporary, return NA - } else { - #If there were at least some allowed pins, return data frame - return(object) - } + #Remove temporary PINs + if (dim(object)[1] == 0) { + return(NA) #If all PINs were temporary, return NA + } else { + #If there were at least some allowed pins, return data frame + return(object) + } } else { object <- subset(quickdf(object), is_temp_test == FALSE) - if (dim(object)[1] == 0) { - return(NA) - } else { - return(unname(do.call("c", object[extract]))) - } + if (dim(object)[1] == 0) { + return(NA) + } else { + return(unname(do.call("c", object[extract]))) + } } } else if (allow.temp == TRUE) { # If temporary PINs are allowed, print the whole data frame normally diff --git a/R/hetu_control_char.R b/R/hetu_control_char.R index 335f0bc..37a2f74 100644 --- a/R/hetu_control_char.R +++ b/R/hetu_control_char.R @@ -45,9 +45,12 @@ hetu_control_char <- function(pin, with.century = TRUE) { stop("Input PINs that only have 10 characters: birthdate, century marker and personal numbers (DDMMYYQZZZ)") } - if (!(substr(pin, start = 7, stop = 7) %in% c("-", "+", "A"))) { - stop("7th character of your PIN needs to be a century marker (-, + or A). -If your PIN does not have it use parameter with.century == FALSE") + if (!(substr(pin, start = 7, stop = 7) %in% c("-", "+", "A", "B", "C", "D", + "E", "F", "Y", "X", "W", "V", + "U"))) { + stop("7th character of your PIN needs to be a valid century marker + (-, +, A or one of the new markers). If your PIN does not have it + use parameter with.century == FALSE") } pin_ddmmyy <- substr(pin, 1, 6) pin_zzz <- substr(pin, 8, 10) @@ -72,7 +75,7 @@ numbers (DDMMYYZZZ)") #' Finnish Unique Identification Number (FINUID, or sähköinen asiointitunnus #' SATU). #' @param pin An incomplete FINUID that has 8 first numbers. -#' @param print.full Should the function print only the whole FINUID-number +#' @param print.full Should the function print only the whole FINUID-number #' (TRUE) or only the control character (FALSE). Default is FALSE. #' @details This method of calculating the control character was devised by #' mathematician Erkki Pale (1962) to detect input errors but also to @@ -84,7 +87,7 @@ numbers (DDMMYYZZZ)") #' #' The method of calculating the control character does not need century #' character and therefore the function has an option to omit it. -#' @return Control character, either a number 0-9 or a letter (length 1 +#' @return Control character, either a number 0-9 or a letter (length 1 #' character). If parameter print.full is set to TRUE, the function returns #' a complete FINUID / SATU number (length 9 characters). #' @seealso diff --git a/R/hetu_diagnostic.R b/R/hetu_diagnostic.R index 41758ed..1f4dedf 100644 --- a/R/hetu_diagnostic.R +++ b/R/hetu_diagnostic.R @@ -1,6 +1,6 @@ ## hetu_diagnostic.R #' @title Diagnostics Tool for Personal Identity Codes -#' @description Prints information on the tests that are used +#' @description Prints information on the tests that are used #' to confirm or reject the validity of each personal identity code. #' @param pin Finnish personal identification number as a character vector, #' or vector of identification numbers as a character vectors @@ -13,20 +13,23 @@ #' @examples #' diagnosis_example <- c("010101-0102", "111111-111Q", #' "010101B0101", "320101-0101", "011301-0101", -#' "010101-01010", "010101-0011") +#' "010101-01010", "010101-0011", "010101-9011", "010101-901S") #' ## Print all diagnostics for various fake personal identity codes #' hetu_diagnostic(diagnosis_example) #' # Extract century-related checks #' hetu_diagnostic(diagnosis_example, extract = "valid.century") -#' @seealso \code{\link{hetu}} for the main function on which +#' # Print a summary in natural language +#' summary(hetu_diagnostic(diagnosis_example)) +#' @seealso \code{\link{hetu}} for the main function on which #' \code{hetu_diagnostic} relies on. #' #' @export hetu_diagnostic <- function(pin, extract = NULL) { diagnostic_params <- c("hetu", "is.temp", "valid.p.num", "valid.ctrl.char", - "correct.ctrl.char", "valid.date", "valid.day", "valid.month", - "valid.year", "valid.length", "valid.century") + "correct.ctrl.char", "valid.date", "valid.day", + "valid.month", "valid.year", "valid.length", + "valid.century") if (!is.null(extract)) { if (!all(extract %in% diagnostic_params)) { @@ -41,6 +44,7 @@ hetu_diagnostic <- function(pin, extract = NULL) { } else { output <- diagnostic_table[, c("hetu", extract)] } + class(output) <- c("diagnostic", "data.frame") return(output) } @@ -53,3 +57,219 @@ hetu_diagnostic <- function(pin, extract = NULL) { #' pin_diagnostic(diagnosis_example) #' @export pin_diagnostic <- hetu_diagnostic + +new_diagnostic <- function(x) { + structure(x, class = c("diagnostic", "data.frame")) +} + +#' @export +#' @noRd +`[.diagnostic` <- function(x, ...) { + new_diagnostic(NextMethod()) +} + +#' @title Is an Object from Class "diagnostic"? +#' @description Returns TRUE if the object has class "diagnostic" +#' @param object Object to be tested +#' @return TRUE or FALSE +#' @export +is.diagnostic <- function(object) { + inherits(object, "diagnostic") +} + +#' @export +#' @noRd +summary.diagnostic <- function(object, ...) { + + diag_params <- c("valid.p.num", "valid.ctrl.char", + "correct.ctrl.char", "valid.date", "valid.day", + "valid.month", "valid.year", "valid.length", "valid.century") + + res <- list() + + res$n_cases <- nrow(object) + res$n_temp_cases <- sum(object$is.temp) + + res$n_regular_and_valid <- sum( + rowSums( + object[which(object$is.temp == FALSE), ][diag_params] + ) == 9 + ) + + res$n_regular_and_invalid <- (res$n_cases - res$n_temp_cases) - res$n_regular_and_valid + + res$n_temp_and_valid <- sum( + rowSums( + object[which(object$is.temp == TRUE), ][diag_params] + ) == 9 + ) + + res$n_temp_and_invalid <- res$n_temp_cases - res$n_temp_and_valid + + res$n_total_valid_cases <- res$n_regular_and_valid + res$n_temp_and_valid + res$n_total_invalid_cases <- res$n_regular_and_invalid + res$n_temp_and_invalid + + object$is.valid <- (rowSums(object[diag_params]) == 9) + class(res) <- "summary.diagnostic" + res +} + +#' @export +#' @noRd +print.summary.diagnostic <- function(x, ...) { + cat("Diagnostics for", x$n_cases, "hetu objects: \n") + cat("Number of valid hetu objects:", x$n_total_valid_cases, "\n") + cat("Number of valid and non-temporary* hetu objects:", + x$n_regular_and_valid, + "\n") + if (x$n_temp_and_valid > 0) { + cat("Number of valid and temporary** hetu objects:", + x$n_temp_and_valid, + "\n\n") + } + cat("Number of invalid hetu objects:", + x$n_total_invalid_cases, + "\n") + if (x$n_regular_and_invalid > 0) { + cat("Number of invalid and non-temporary* hetu objects:", + x$n_regular_and_invalid, + "\n") + } + if (x$n_temp_and_invalid > 0) { + cat("Number of invalid and temporary** hetu objects:", + x$n_temp_and_invalid, + "\n") + } + + if (x$n_regular_and_valid > 0 || x$n_regular_and_invalid > 0) { + cat("\n", + "* non-temporary: p.num in range [002-899]\n") + } + if (x$n_temp_and_invalid > 0 || x$n_temp_and_valid > 0) { + cat("", + "** temporary: p.num in range [900-999]\n\n") + } + + if ((length(x$valid_hetu) - sum(x$valid_hetu)) > 0) { + cat("\n", + "See table output by hetu_diagnostic() for more detailed information", + "\n" + ) + } +} + +#' @title Plotting method for diagnostic class objects +#' @description Creates a concise plot that visualizes TRUE and FALSE cases +#' in a diagnostics data frame +#' @details There seems to be no canonical answer on what to call this type of +#' plot. Some of the names that can be found online when describing a plot for +#' binary response value on an axis are: a one-dimensional scatterplot, +#' a sparkline, a rug plot, or a strip plot / strip chart. +#' @param x a "summary.diagnostic" object +#' @param labels include column labels on y-axis, default is TRUE +#' @param negate.logicals negate TRUE and FALSE logicals, default is FALSE. +#' Sometimes it may be beneficial to emphasize FALSE cases instead of TRUE +#' @param ... Arguments to be passed to methods, such as graphical parameters. +#' For example: +#' \itemize{ +#' \item{\strong{type} what type of plot should be drawn. +#' Default is "h" for histogram / high density vertical lines.} +#' \item{\strong{lwd} line width as double Default is 1.0} +#' } +#' See \code{\link[base]{plot}} and \code{\link[graphics]{par}} for +#' more options +#' @importFrom graphics par text +#' @importFrom methods hasArg +#' @importFrom utils hasName +#' +#' @export +plot.diagnostic <- function(x, + negate.logicals = FALSE, + labels = TRUE, + ...) { + + ellipsis_args <- list(...) + + if (hasName(ellipsis_args, "type")) { + type <- ellipsis_args$type + } else { + type <- "h" + } + if (hasName(ellipsis_args, "lwd")) { + lwd <- ellipsis_args$lwd + } else { + lwd <- 1.0 + } + if (!hasArg(labels)) { + labels <- TRUE + } + + def.par <- par(no.readonly = TRUE) + on.exit(par(def.par)) + + if (negate.logicals == TRUE) { + logicals <- (unname(sapply(x, class)) == "logical") + x[logicals] <- !x[logicals] + } + + if (labels) { + # Marginals with labels + par(mar = c(1, 7.5, 1, 1)) + } else { + # Marginals without labels + par(mar = c(1, 4, 1, 1)) + } + par(mfrow = c(ncol(x), 1)) + par(las = 1) + for (i in 2:ncol(x)) { + testiobjekti <- x[i] + testiobjekti <- unclass(testiobjekti) + class(testiobjekti) <- "data.frame" + # Plots the box + plot(NA, + xlim = c(min(attributes(testiobjekti)$row.names), + max(attributes(testiobjekti)$row.names)), + ylim = c(-0.25, 1.25), + ylab = "", + xlab = letters[i], + xaxt = "n", + yaxt = "n") + + # Ensure that every element is added on top of the last one + par(new = TRUE) + + # Plots the values inside the box, axis ticks, axis labels + plot(testiobjekti, + xlim = c(min(attributes(testiobjekti)$row.names), + max(attributes(testiobjekti)$row.names)), + ylim = c(-0.25, 1.25), + type = type, + lwd = lwd, + lend = 1, + ylab = "", + xlab = "", + yaxt = "n", + cex = 1.0) + + x_min <- par("usr")[1] + + # Plot y-axis labels outside + text(x = (x_min - 0.01), + # In the middle of the bar, outside + y = 0.5, + # Label is the name of the column + # Add empty spaces to give some room between box and label + labels = ifelse(test = negate.logicals, + yes = paste0("!", names(testiobjekti), " "), + no = paste0(names(testiobjekti), " ")), + # Rotation 0 degrees (default is 90) + srt = 0, + # Change clipping region to none + xpd = NA, + # 100 % right-justified + adj = 1, + # Slightly larger text + cex = 1.1) + + } +} diff --git a/R/pin_age.R b/R/pin_age.R index c5b5aef..f910030 100644 --- a/R/pin_age.R +++ b/R/pin_age.R @@ -1,5 +1,5 @@ #' @title Extract Age from Personal Identity Code -#' @description Calculate age in years, months, weeks or days from +#' @description Calculate age in years, months, weeks or days from #' personal identity codes. #' @inheritParams hetu #' @param date Date at which age is calculated. If a vector is provided it @@ -23,7 +23,7 @@ #' #' @export pin_age <- function(pin, - date=Sys.Date(), + date = Sys.Date(), timespan = "years", allow.temp = FALSE) { @@ -50,7 +50,7 @@ pin_age <- function(pin, all_pins[!hetuframe$valid.pin] <- NA if (length(date) > 1) { valid_diff <- !is.na(all_pins) & !is.na(date) - } else{ + } else { valid_diff <- !is.na(all_pins) } pin <- all_pins[valid_diff] diff --git a/R/pin_ctrl.R b/R/pin_ctrl.R index 63544a0..9bd736f 100644 --- a/R/pin_ctrl.R +++ b/R/pin_ctrl.R @@ -90,7 +90,7 @@ bid_ctrl <- function(bid) { if (check == 0) { check <- check } else if (check %in% c(2:10)) { - check <- 11 - check + check <- 11 - check } else { check <- FALSE return(check) diff --git a/R/pin_sex.R b/R/pin_sex.R index d1513d3..d5e8d4f 100644 --- a/R/pin_sex.R +++ b/R/pin_sex.R @@ -1,5 +1,5 @@ #' @title Extract Sex from Personal Identity Code -#' @description Extract sex (as binary) from Finnish personal identification +#' @description Extract sex (as binary) from Finnish personal identification #' code. #' @inheritParams hetu #' @return Factor with label 'Male' and 'Female'. diff --git a/R/rpin.R b/R/rpin.R index 442328d..c4dd579 100644 --- a/R/rpin.R +++ b/R/rpin.R @@ -1,26 +1,44 @@ #' @title Generate Random Personal Identity Codes #' #' @description -#' A function that generates random Finnish personal identity codes +#' A function that generates random Finnish personal identity codes #' (\code{hetu} codes). #' #' @details -#' There is a finite number of valid personal identity codes available per day. -#' More specifically, there are 498 odd personal numbers for males and 498 even -#' personal numbers for females from range 002-899. Additionally there are 50 -#' odd numbers for males and 50 even numbers for females in the temporary -#' personal identity code number range 900-999 that is not normally in use. #' This function will return an error "too few positive probabilities" in -#' sample.int function if you try to generate too many codes in a short enough -#' timeframe. +#' \code{\link{sample.int}} function if you try to generate too many codes +#' in a short enough timeframe. The theoretical upper limit of valid PINs is +#' in the millions, but the number of valid PINs per day used to be 898 PINs +#' at maximum, meaning 327770 for each year. Attempting to generate e.g. +#' a 1000 pins for a timespan of one day would result in an error. #' -#' The theoretical upper limit of valid PINs is in the millions since there are -#' 898 PINs available for each day, 327770 for each year. In practice this -#' number is much lower since same personal number component cannot be -#' "recycled" if it has been used in the past. To illustrate, if an identity -#' code "010101-0101" has already been assigned to someone born in 1901-01-01, -#' a similar code "010101A0101" for someone born in 2001-01-01 could not be -#' used. +#' In practice this theoretical upper limit number was +#' much lower since the old practice was that the same personal number +#' component cannot be "recycled" if it has been used in the past. +#' To illustrate, if an identity code "010101-0101" has already been assigned +#' to someone born in 1901-01-01, a similar code "010101A0101" for someone +#' born in 2001-01-01 could not be used. +#' +#' In hetu package version 1.1.0 we have taken into account a new government +#' decree that increased the amount of valid century markers and therefore +#' increased the amount of valid personal codes per day. Additionally, the +#' decree has made it possible to recycle individual codes, as the century +#' marker is now thought to be a distinguishing character of the personal +#' identity code. +#' +#' However, the current implementation still keeps the old 898 codes per day +#' limit intact, and assigns new century markers with a low probability: old +#' markers "-" and "A" are given a 95 % probability of appearing and the new +#' markers are given a 1 % probability each. +#' +#' In the future this may be altered +#' into a waterfall pattern so that the initial 898 codes for each date +#' get "-" as the century marker, the next 898 get "Y", and so on. +#' This would mean that each day would have 5388 valid codes and the +#' distribution of century markers would be more +#' realistic in the sense that additional century markers are taken into use +#' only after the previous range has been exhausted. However, this would +#' require generating rather large datasets even for basic testing purposes. #' #' @param n number of generated \code{hetu}-pins #' @param start.date Lower limit of generated \code{hetu} dates, @@ -56,17 +74,17 @@ rpin <- function(n, p.temp = 0.0, num.cores = 1) { - start.date <- as.Date(start.date) - end.date <- as.Date(end.date) + start_date <- as.Date(start.date) + end_date <- as.Date(end.date) assert_double(p.temp, 0, 1) assert_double(p.male, 0, 1) - assert_date(end.date, start.date, Sys.Date()) - assert_date(start.date, as.Date("1860-01-01"), end.date) + assert_date(end_date, start_date, Sys.Date()) + assert_date(start_date, as.Date("1860-01-01"), end_date) # Oversample a bit to make up for filtered PINs (duplicates, PINs with # inadequate personal numbers) - rdates <- sample(start.date:end.date, + rdates <- sample(start_date:end_date, size = n, replace = TRUE) @@ -89,26 +107,39 @@ rpin <- function(n, prob_female <- rep(((1 - p.male) * (1 - p.temp)), length(female_nums)) prob_female_temp <- rep(((1 - p.male) * p.temp), length(female_temp)) - p_nums <- unlist( - mclapply(X = dates_table, - FUN = function(x) sample(c(male_nums, female_nums, - male_temp, female_temp), - size = x, - prob = c(prob_male, prob_female, - prob_male_temp, prob_female_temp) - ), - mc.cores = num.cores - ) + p_nums <- + unlist( + mclapply( + X = dates_table, + FUN = function(x) { + sample( + c(male_nums, female_nums, male_temp, female_temp), + size = x, + prob = c(prob_male, prob_female, prob_male_temp, prob_female_temp) + ) + }, + mc.cores = num.cores + ) ) ddmmyyyy <- rep(names(dates_table), times = dates_table) century <- lapply(X = ddmmyyyy, - FUN = function(y) switch(substr(y, 1, 2), - "20" = "A", - "19" = "-", - "18" = "+", - stop("Invalid input"))) + FUN = function(y) { + switch(substr(y, 1, 2), + "20" = sample( + c("A", "B", "C", "D", "E", "F"), + size = 1, + prob = c(0.95, 0.01, 0.01, 0.01, 0.01, 0.01) + ), + "19" = sample( + c("-", "Y", "X", "W", "V", "U"), + size = 1, + prob = c(0.95, 0.01, 0.01, 0.01, 0.01, 0.01) + ), + "18" = "+", + stop("Invalid input")) + }) ddmmyy <- format(as.Date(ddmmyyyy), "%d%m%y") diff --git a/inst/CITATION b/inst/CITATION index 97c8992..ae44130 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -3,11 +3,11 @@ citHeader("Kindly cite the hetu R package as follows:") year <- sub(".*(2[[:digit:]]{3})-.*", "\\1", meta$Date, perl = TRUE) vers <- paste("R package version", meta$Version) -citEntry( - entry="misc", +bibentry( + bibtype="Misc", title = "hetu: Structural Handling of Finnish Personal Identity Codes", author = - personList( + c( person(given ="Pyry", family="Kantanen", email = "pyry.kantanen@gmail.com"), person(given ="Mans", family="Magnusson"), person(given ="Jussi", family="Paananen"), diff --git a/man/hetu.Rd b/man/hetu.Rd index 4e8497c..deafed9 100644 --- a/man/hetu.Rd +++ b/man/hetu.Rd @@ -4,7 +4,13 @@ \alias{hetu} \title{Generic Extraction Tool for Finnish Personal Identity Codes} \usage{ -hetu(pin, extract = NULL, allow.temp = FALSE, diagnostic = FALSE) +hetu( + pin, + extract = NULL, + allow.temp = FALSE, + diagnostic = FALSE, + as.factor = FALSE +) } \arguments{ \item{pin}{Finnish personal identity code(s) as a character vector} @@ -24,6 +30,10 @@ PINs. The checks are "\code{valid.p.num}", "\code{valid.ctrl.char}", "\code{correct.ctrl.char}", "\code{valid.date}", "\code{valid.day}", "\code{valid.month}", "\code{valid.length}", "\code{valid.century}". Default is \code{FALSE} which returns no diagnostic information.} + +\item{as.factor}{Makes fields "\code{sex}", "\code{p.num}", +"\code{ctrl.char}" and "\code{century}" into factors for slightly reduced +memory footprint. Default is FALSE.} } \value{ Finnish personal identity code data.frame, @@ -34,15 +44,15 @@ Finnish personal identity code data.frame, A correct pin should be in the form DDMMYYCZZZQ, where DDMMYY stands for date, C for century sign, ZZZ for personal number and Q for control character.} -\item{sex}{sex of the person as a character vector ("Male" or "Female").} -\item{p.num}{Personal number part of the identity code.} -\item{ctrl.char}{Control character for the personal identity code.} -\item{date}{Birthdate.} -\item{day}{Day of the birthdate.} -\item{month}{Month of the birthdate.} -\item{year}{Year of the birthdate.} -\item{century}{Century character of the birthdate: + (1800), - (1900) or - A (2000).} +\item{sex}{sex of the person as a character vector ("Male" or "Female")} +\item{p.num}{Personal number (individual number) part of the identity code} +\item{ctrl.char}{Control character for the personal identity code} +\item{date}{Birthdate} +\item{day}{Day of the birthdate} +\item{month}{Month of the birthdate} +\item{year}{Year of the birthdate} +\item{century}{Century character determining the century (1800s, 1900s or +2000s) of the person's birth. See details for more information} \item{valid.pin}{Does the personal identity code pass all validity checks: (\code{TRUE} or \code{FALSE})} } @@ -50,6 +60,56 @@ Finnish personal identity code data.frame, Extract embedded information from Finnish personal identity codes (hetu). } +\details{ +Starting from 1st of January 2023, an amendment to the government decree on +the Population Information System (128/2010) has expanded the +number of available century markers (See references: Valtioneuvoston asetus +VM/2022/124) and scrapped some old practices. + +For the users of this package the most visible change will be that +people born in the 1900s can now be assigned with "Y", "X", "W", "V" or "U", +in addition to the old "-" (slash) marker. People born in the 2000s can be +assigned with "B", "C", "D", E" or "F", in addition to the old marker, "A". +For people born in the 1800s "+" (plus sign) remains the only valid marker. +The amendment does not affect already existing personal identity codes. + +The change was done to mitigate for the diminishing pool of available, unique +identity codes. For historical reasons, the century marker of +the code was not always taken into account when determining the uniqueness +of the number. This meant that individual number parts were not recycled +between people born in different centuries, diminishing the amount of +available numbers for people born in the new century. +For example, if a female born in the 1st of January 1901 +was assigned with the personal identity code "010101-0101" (individual code +part "010"), a female born in 1st of January 2001 could not be assigned with +the code "010101A0101" because it would contain the same individual code +as the person born in 1901 and individual codes could not be recycled. With +the amended decree the uniqueness of the personal identity code is considered +by looking at the personal identity code as a whole. This means that from now +on it would be permissible to have personal identity codes such as +"100190-999P" and "100190Y999P" at the same time, denoting two different +individuals (see references: Digital and population data services agency +announcement). + +In practice, codes with new separators will be issued only when the ranges +ranges with currently used separators run out. This means that it might +take a while until we see people born in the 2000s assigned with the century +marker "C" or people born in the 1900s assigned with the century marker "X", +as there are still plenty of numbers in ranges "B" and "Y" as well, in +addition to some numbers being left in the original ranges of "A" and "-". +The first personal identity code with a new separator "Y" was assigned +in December 2023 (see Digi- ja väestötietovirasto 2023). + +The result of all this is that the hetu package may now give "unrealistic" +personal identity codes in the sense that some codes are not yet actually +in use. However, it is not the aim of this package to simulate the +actual distributions of personal identity codes and their century markers in +the population (the actually used and unused codes are unknown to us), +but to provide a tool that can be used to extract data from these codes, +should the user encounter them at some point. Writing further sanity checks +is probably a good idea for people who are interested in detecting unusual +patterns in their databases and registries. +} \examples{ hetu("111111-111C") hetu("111111-111C")$date @@ -60,11 +120,21 @@ hetu("111111-111C", extract="sex") hetu(c("010101-0101", "111111-111C")) # Process a vector of hetu's and extract sex information from each hetu(c("010101-0101", "111111-111C"), extract="sex") +# Process codes with new century markers +new_codes <- c("010594Y9032", "010594Y9021", "020594X903P") +hetu(new_codes) + +} +\references{ +Valtioneuvoston asetus VM/2022/124 \href{https://vm.fi/paatos?decisionId=0900908f807c5f3c}{Valtioneuvoston asetus VM/2022/124} + +Digi- ja väestötietovirasto. (2023). \href{https://dvv.fi/-/uudet-valimerkit-takaavat-henkilotunnusten-riittavyyden-ensimmainen-uudenlainen-henkilotunnus-myonnettiin-talla-viikolla}{Uudet välimerkit takaavat henkilötunnusten riittävyyden - ensimmäinen uudenlainen henkilötunnus myönnettiin tällä viikolla} +Digital and Population Data Services Agency. \href{https://dvv.fi/en/reform-of-personal-identity-code}{Reform of the separators in the personal identity code} } \seealso{ -\code{\link{pin_ctrl}} For validating Finnish personal - identity codes. +\code{\link{pin_ctrl}} Validating Finnish personal identity codes. +\code{\link{rhetu}} Generating random Finnish personal identity codes. } \author{ Pyry Kantanen, Jussi Paananen diff --git a/man/hetu_diagnostic.Rd b/man/hetu_diagnostic.Rd index 23d749c..ac9ca6e 100644 --- a/man/hetu_diagnostic.Rd +++ b/man/hetu_diagnostic.Rd @@ -23,17 +23,19 @@ Valid values are "\code{hetu}", "\code{is.temp}", "\code{valid.p.num}", A data.frame containing diagnostic checks about PINs. } \description{ -Prints information on the tests that are used +Prints information on the tests that are used to confirm or reject the validity of each personal identity code. } \examples{ diagnosis_example <- c("010101-0102", "111111-111Q", "010101B0101", "320101-0101", "011301-0101", -"010101-01010", "010101-0011") +"010101-01010", "010101-0011", "010101-9011", "010101-901S") ## Print all diagnostics for various fake personal identity codes hetu_diagnostic(diagnosis_example) # Extract century-related checks hetu_diagnostic(diagnosis_example, extract = "valid.century") +# Print a summary in natural language +summary(hetu_diagnostic(diagnosis_example)) diagnosis_example <- c("010101-0102", "111111-111Q", "010101B0101", "320101-0101", "011301-0101", "010101-01010", "010101-0011") @@ -41,6 +43,6 @@ diagnosis_example <- c("010101-0102", "111111-111Q", pin_diagnostic(diagnosis_example) } \seealso{ -\code{\link{hetu}} for the main function on which +\code{\link{hetu}} for the main function on which \code{hetu_diagnostic} relies on. } diff --git a/man/is.diagnostic.Rd b/man/is.diagnostic.Rd new file mode 100644 index 0000000..ba7d6c1 --- /dev/null +++ b/man/is.diagnostic.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hetu_diagnostic.R +\name{is.diagnostic} +\alias{is.diagnostic} +\title{Is an Object from Class "diagnostic"?} +\usage{ +is.diagnostic(object) +} +\arguments{ +\item{object}{Object to be tested} +} +\value{ +TRUE or FALSE +} +\description{ +Returns TRUE if the object has class "diagnostic" +} diff --git a/man/pin_age.Rd b/man/pin_age.Rd index 7e793f0..c454be5 100644 --- a/man/pin_age.Rd +++ b/man/pin_age.Rd @@ -31,7 +31,7 @@ use (personal numbers 002-899) are allowed.} Age as an integer vector. } \description{ -Calculate age in years, months, weeks or days from +Calculate age in years, months, weeks or days from personal identity codes. } \examples{ diff --git a/man/pin_sex.Rd b/man/pin_sex.Rd index c02f536..e91e69e 100644 --- a/man/pin_sex.Rd +++ b/man/pin_sex.Rd @@ -20,7 +20,7 @@ use (personal numbers 002-899) are allowed.} Factor with label 'Male' and 'Female'. } \description{ -Extract sex (as binary) from Finnish personal identification +Extract sex (as binary) from Finnish personal identification code. } \examples{ diff --git a/man/plot.diagnostic.Rd b/man/plot.diagnostic.Rd new file mode 100644 index 0000000..9153b31 --- /dev/null +++ b/man/plot.diagnostic.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hetu_diagnostic.R +\name{plot.diagnostic} +\alias{plot.diagnostic} +\title{Plotting method for diagnostic class objects} +\usage{ +\method{plot}{diagnostic}(x, negate.logicals = FALSE, labels = TRUE, ...) +} +\arguments{ +\item{x}{a "summary.diagnostic" object} + +\item{negate.logicals}{negate TRUE and FALSE logicals, default is FALSE. +Sometimes it may be beneficial to emphasize FALSE cases instead of TRUE} + +\item{labels}{include column labels on y-axis, default is TRUE} + +\item{...}{Arguments to be passed to methods, such as graphical parameters. +For example: + \itemize{ + \item{\strong{type} what type of plot should be drawn. + Default is "h" for histogram / high density vertical lines.} + \item{\strong{lwd} line width as double Default is 1.0} + } + See \code{\link[base]{plot}} and \code{\link[graphics]{par}} for + more options} +} +\description{ +Creates a concise plot that visualizes TRUE and FALSE cases +in a diagnostics data frame +} +\details{ +There seems to be no canonical answer on what to call this type of +plot. Some of the names that can be found online when describing a plot for +binary response value on an axis are: a one-dimensional scatterplot, +a sparkline, a rug plot, or a strip plot / strip chart. +} diff --git a/man/rpin.Rd b/man/rpin.Rd index 8f9903f..b8def0b 100644 --- a/man/rpin.Rd +++ b/man/rpin.Rd @@ -46,26 +46,44 @@ Default is 1.} a vector of generated \code{hetu}-pins. } \description{ -A function that generates random Finnish personal identity codes +A function that generates random Finnish personal identity codes (\code{hetu} codes). } \details{ -There is a finite number of valid personal identity codes available per day. -More specifically, there are 498 odd personal numbers for males and 498 even -personal numbers for females from range 002-899. Additionally there are 50 -odd numbers for males and 50 even numbers for females in the temporary -personal identity code number range 900-999 that is not normally in use. This function will return an error "too few positive probabilities" in -sample.int function if you try to generate too many codes in a short enough -timeframe. +\code{\link{sample.int}} function if you try to generate too many codes +in a short enough timeframe. The theoretical upper limit of valid PINs is +in the millions, but the number of valid PINs per day used to be 898 PINs +at maximum, meaning 327770 for each year. Attempting to generate e.g. +a 1000 pins for a timespan of one day would result in an error. -The theoretical upper limit of valid PINs is in the millions since there are -898 PINs available for each day, 327770 for each year. In practice this -number is much lower since same personal number component cannot be -"recycled" if it has been used in the past. To illustrate, if an identity -code "010101-0101" has already been assigned to someone born in 1901-01-01, -a similar code "010101A0101" for someone born in 2001-01-01 could not be -used. +In practice this theoretical upper limit number was +much lower since the old practice was that the same personal number +component cannot be "recycled" if it has been used in the past. +To illustrate, if an identity code "010101-0101" has already been assigned +to someone born in 1901-01-01, a similar code "010101A0101" for someone +born in 2001-01-01 could not be used. + +In hetu package version 1.1.0 we have taken into account a new government +decree that increased the amount of valid century markers and therefore +increased the amount of valid personal codes per day. Additionally, the +decree has made it possible to recycle individual codes, as the century +marker is now thought to be a distinguishing character of the personal +identity code. + +However, the current implementation still keeps the old 898 codes per day +limit intact, and assigns new century markers with a low probability: old +markers "-" and "A" are given a 95 % probability of appearing and the new +markers are given a 1 % probability each. + +In the future this may be altered +into a waterfall pattern so that the initial 898 codes for each date +get "-" as the century marker, the next 898 get "Y", and so on. +This would mean that each day would have 5388 valid codes and the +distribution of century markers would be more +realistic in the sense that additional century markers are taken into use +only after the previous range has been exhausted. However, this would +require generating rather large datasets even for basic testing purposes. } \examples{ x <- rpin(3) diff --git a/man/satu_control_char.Rd b/man/satu_control_char.Rd index 6b753c3..5c5d166 100644 --- a/man/satu_control_char.Rd +++ b/man/satu_control_char.Rd @@ -9,11 +9,11 @@ satu_control_char(pin, print.full = FALSE) \arguments{ \item{pin}{An incomplete FINUID that has 8 first numbers.} -\item{print.full}{Should the function print only the whole FINUID-number +\item{print.full}{Should the function print only the whole FINUID-number (TRUE) or only the control character (FALSE). Default is FALSE.} } \value{ -Control character, either a number 0-9 or a letter (length 1 +Control character, either a number 0-9 or a letter (length 1 character). If parameter print.full is set to TRUE, the function returns a complete FINUID / SATU number (length 9 characters). } diff --git a/tests/testthat/test_hetu.R b/tests/testthat/test_hetu.R index b0d5a4d..bd432c1 100644 --- a/tests/testthat/test_hetu.R +++ b/tests/testthat/test_hetu.R @@ -23,7 +23,7 @@ test_that("hetu() works correctly", { expect_false(hetu("320101-010A")$valid.pin) # check day expect_false(hetu("011301-010P")$valid.pin) # check month expect_false(suppressWarnings(hetu("0101-1-0101")$valid.pin)) # check year - expect_false(hetu("010101B0101")$valid.pin) # check century marker + expect_false(hetu("010101G0101")$valid.pin) # check century marker expect_true(is.na(hetu("290201-010A")$date)) #Check if date exists expect_equal(as.character(hetu("010199+010A")$century), "+") expect_equal(as.character(hetu("010101-0101")$century), "-") @@ -40,6 +40,30 @@ test_that("hetu() works correctly", { expect_true(is.na(hetu(c("010101A900R", "010101A900R")))) expect_false(hetu("010101-01013")$valid.pin) expect_true(!is.null(hetu("010101-0101", diagnostic = TRUE))) + # New personal identity codes + # from Koulutusymparisto_Hetun_uusia_valimerkkeja.xlsx + new_codes <- c("010594Y9032", + "010594Y9021", + "020594X903P", + "020594X902N", + "030594W903B", + "030694W9024", + "040594V9030", + "040594V902Y", + "050594U903M", + "050594U902L", + "010516B903X", + "010516B902W", + "020516C903K", + "020516C902J", + "030516D9037", + "030516D9026", + "010501E9032", + "020502E902X", + "020503F9037", + "020504A902E", + "020504B904H") + expect_true(all(hetu(new_codes, allow.temp = TRUE, extract = "valid.pin"))) }) test_that("pin_ctrl() works correctly", { @@ -57,7 +81,7 @@ test_that("bid_ctrl() works correctly", { #this is intentionally wrong, no infix operator error expect_warning(bid_ctrl(0737546-1)) }) - + test_that("pin_to_date() works correctly and produces deprecation warning", { expect_warning(all((pin_to_date(c("010101-0101", "111111-111C")) == c("1901-01-01", "1911-11-11")))) @@ -71,7 +95,7 @@ test_that("pin_date() works correctly", { expect_true(all((pin_date(c("010101A0101", "111111A111C")) == c("2001-01-01", "2011-11-11")))) }) - + test_that("pin_age() works correctly", { expect_true(pin_age("010101-0101", date = Sys.Date()) > 100) expect_true(all(pin_age(c("010101-0101", "111111-111C"), @@ -112,6 +136,6 @@ test_that("hetu_control_char works correctly", { expect_visible(hetu_control_char("010101010", with.century = FALSE)) expect_visible(hetu_control_char(c("010101-010", "111111-111"))) expect_error(hetu_control_char("010101-0101", with.century = TRUE)) - expect_error(hetu_control_char("010101B010", with.century = TRUE)) + expect_error(hetu_control_char("010101Q010", with.century = TRUE)) expect_error(hetu_control_char("0101010101", with.century = FALSE)) }) diff --git a/vignettes/hetu.Rmd b/vignettes/hetu.Rmd index 7697bec..883e4b3 100644 --- a/vignettes/hetu.Rmd +++ b/vignettes/hetu.Rmd @@ -41,7 +41,7 @@ library(hetu) We also recommend setting the UTF-8 encoding: ```{r locale, eval=FALSE} -Sys.setlocale(locale="UTF-8") +Sys.setlocale(locale = "UTF-8") ``` ## Introduction @@ -186,9 +186,9 @@ table(hetu(temp_sample, allow.temp = TRUE, extract = "is.temp")) In addition to information mentioned in the section [Extracting specific information](#extracting-specific-information), the user can choose to print additional columns containing information about checks done on PINs. The diagnostic checks produce a TRUE or FALSE for the following categories: *valid.p.num*, *valid.checksum*, *correct.checksum*, *valid.date*, *valid.day*, *valid.month*, *valid.year*, *valid.length* and *valid.century*, FALSE meaning that hetu is somehow incorrect. ```{r example_diagnostics1, error = TRUE, purl = FALSE, warning = FALSE} -diagnosis_example <- c("010101-0102", "111111-111Q", -"010101B0101", "320101-0101", "011301-0101", -"010101-01010", "010101-0011") +diagnosis_example <- c("010101-0102", "111111-111Q", + "010101B0101", "320101-0101", "011301-0101", + "010101-01010", "010101-0011") head(hetu(diagnosis_example, diagnostic = TRUE), 3) ``` @@ -201,7 +201,8 @@ tail(hetu_diagnostic(diagnosis_example), 3) By using extract parameter, the user can choose which columns will be printed in the output table. Valid extract values are listed in the function's help file. ```{r example_diagnostics3, error = TRUE, purl = FALSE, warning = FALSE} -hetu_diagnostic(diagnosis_example, extract = c("valid.century", "correct.checksum")) +hetu_diagnostic(diagnosis_example, extract = c("valid.century", + "correct.checksum")) ``` Because of the way PINs are handled in inside hetu-function, the diagnostics-function can show unexpected warning messages or introduce NAs by coercion if the date-part of the PIN is too long. This may result in inability to handle the PIN at all! @@ -243,11 +244,14 @@ library(tidyverse) library(dplyr) # Generate data for this example -hdat<-tibble(pin=rhetu(n = 4, start_date = "1990-01-01", end_date = "2005-01-01")) +hdat <- tibble(pin = rhetu(n = 4, + start.date = "1990-01-01", + end.date = "2005-01-01")) # Extract all the hetu information to tibble format -hdat<-hdat %>% - mutate(result=map(.x=pin,.f=hetu::hetu)) %>% unnest(cols=c(result)) +hdat <- hdat %>% + mutate(result = map(.x = pin, .f = hetu::hetu)) %>% + unnest(cols = c(result)) hdat ``` @@ -265,8 +269,8 @@ citation("hetu") - [The personal identity code](https://dvv.fi/en/personal-identity-code). Digital and population data services agency. - [Valtioneuvoston asetus väestötietojärjestelmästä (128/2010)](https://www.finlex.fi/fi/laki/ajantasa/2010/20100128) (In Finnish). Valtiovarainministeriö. -- [HETU-uudistuksen loppuraportti](http://urn.fi/URN:ISBN:978-952-367-296-3) (In Finnish). Valtiovarainministeriön julkaisuja 2020:20. -- [The Business Information System (BIS)](https://www.prh.fi/en/kaupparekisteri/rekisterointipalvelut/ytj.html). Finnish Patent and Registration Office. +- [HETU-uudistuksen loppuraportti](https://urn.fi/URN:ISBN:978-952-367-296-3) (In Finnish). Valtiovarainministeriön julkaisuja 2020:20. +- [The Business Information System (BIS)](https://www.prh.fi/en/yhdistysrekisteri/business_id.html). Finnish Patent and Registration Office. ## Session info