From 8cd9a35dc33963b30018e37f81690152cdc67d8b Mon Sep 17 00:00:00 2001
From: Erika Rasnick <45949235+erikarasnick@users.noreply.github.com>
Date: Mon, 18 Jul 2022 10:50:31 -0400
Subject: [PATCH] =?UTF-8?q?don't=20pad=204=20digit=20tracts=20and=20change?=
 =?UTF-8?q?=20column=20names=20to=20start=20with=20census=E2=80=A6=20(#25)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* don't pad 4 digit tracts and change column names to start with census instead of fips

* readme changes
---
 00_make_block_group_shp.R                     | 50 +++++++++----------
 01_get_2020_block_groups.R                    |  8 ++-
 Dockerfile                                    |  2 +-
 README.md                                     | 30 ++++++-----
 entrypoint.R                                  |  4 +-
 ...geocoded_census_block_group_0.6.0_1970.csv |  6 +++
 ...geocoded_census_block_group_0.6.0_1980.csv |  6 +++
 ...geocoded_census_block_group_0.6.0_1990.csv |  6 +++
 ...geocoded_census_block_group_0.6.0_2000.csv |  6 +++
 ...geocoded_census_block_group_0.6.0_2010.csv |  6 +++
 ...geocoded_census_block_group_0.6.0_2020.csv |  6 +++
 11 files changed, 85 insertions(+), 45 deletions(-)
 create mode 100644 test/my_address_file_geocoded_census_block_group_0.6.0_1970.csv
 create mode 100644 test/my_address_file_geocoded_census_block_group_0.6.0_1980.csv
 create mode 100644 test/my_address_file_geocoded_census_block_group_0.6.0_1990.csv
 create mode 100644 test/my_address_file_geocoded_census_block_group_0.6.0_2000.csv
 create mode 100644 test/my_address_file_geocoded_census_block_group_0.6.0_2010.csv
 create mode 100644 test/my_address_file_geocoded_census_block_group_0.6.0_2020.csv

diff --git a/00_make_block_group_shp.R b/00_make_block_group_shp.R
index 8b19556..cd058f3 100644
--- a/00_make_block_group_shp.R
+++ b/00_make_block_group_shp.R
@@ -5,7 +5,7 @@ library(sf)
 blk_grps_sf_2010 <- st_read("nhgis0002_shape/nhgis0002_shapefile_tl2010_us_blck_grp_2010/US_blck_grp_2010.shp")
 
 blk_grps_sf_2010 <- st_transform(blk_grps_sf_2010, crs=5072) %>%
-  dplyr::select(fips_block_group_id_2010 = GEOID10,
+  dplyr::select(census_block_group_id_2010 = GEOID10,
                 geometry)
 
 blk_grps_sf_2010 <- sf::st_make_valid(blk_grps_sf_2010)
@@ -17,9 +17,9 @@ saveRDS(blk_grps_sf_2010, "block_groups_2010_5072.rds")
 blk_grps_sf_2000 <- st_read("nhgis0015_shape/nhgis0015_shapefile_tl2000_us_blck_grp_2000/US_blck_grp_2000.shp")
 
 blk_grps_sf_2000 <- st_transform(blk_grps_sf_2000, crs=5072) %>%
-  dplyr::select(fips_block_group_id_2000 = STFID,
+  dplyr::select(census_block_group_id_2000 = STFID,
                 geometry) %>%
-  mutate(fips_block_group_id_2000 = as.character(fips_block_group_id_2000))
+  mutate(census_block_group_id_2000 = as.character(census_block_group_id_2000))
 
 blk_grps_sf_2000 <- sf::st_make_valid(blk_grps_sf_2000)
 
@@ -31,17 +31,13 @@ blk_grps_sf_1990 <- st_read("nhgis0016_shape/nhgis0016_shapefile_tl2000_us_blck_
 
 blk_grps_sf_1990 <- st_transform(blk_grps_sf_1990, crs=5072)
 
-blk_grps_sf_1990 %>%
-  dplyr::mutate(state_fips = stringr::str_sub(string = GISJOIN2, 1, 2))
-
 blk_grps_sf_1990 <- blk_grps_sf_1990 %>%
   dplyr::mutate(state_fips = stringr::str_sub(blk_grps_sf_1990$GISJOIN2, 1, 2),
                 county_fips = stringr::str_sub(blk_grps_sf_1990$GISJOIN2, 4, 6),
                 tract_fips = stringr::str_sub(blk_grps_sf_1990$GISJOIN2, 8, -2),
-                tract_fips = stringr::str_pad(tract_fips, 6, pad = "0"),
-                fips_block_group_id_1990 = glue::glue('{state_fips}{county_fips}{tract_fips}{GROUP}')) %>%
-  select(fips_block_group_id_1990, geometry) %>%
-  mutate(fips_block_group_id_1990 = as.character(fips_block_group_id_1990))
+                census_block_group_id_1990 = glue::glue('{state_fips}{county_fips}{tract_fips}{GROUP}')) %>%
+  select(census_block_group_id_1990, geometry) %>%
+  mutate(census_block_group_id_1990 = as.character(census_block_group_id_1990))
 
 blk_grps_sf_1990 <- sf::st_make_valid(blk_grps_sf_1990)
 
@@ -54,14 +50,13 @@ tracts_sf_1980 <- st_read('nhgis0018_shape/nhgis0018_shapefile_tl2000_us_tract_1
 tracts_sf_1980 <- st_transform(tracts_sf_1980, crs=5072)
 
 tracts_sf_1980 <- tracts_sf_1980 %>%
-  dplyr::mutate(state_fips = stringr::str_sub(NHGISST, 1, 2),
-                county_fips = stringr::str_sub(NHGISCTY, 1, 3),
-                tract_fips = stringr::str_sub(GISJOIN, 9, -1),
-                tract_fips = stringr::str_pad(tract_fips, 6, pad = "0"),
-                fips_tract_id_1980 = glue::glue('{state_fips}{county_fips}{tract_fips}')) %>%
-  dplyr::select(fips_tract_id_1980,
+  mutate(state_fips = stringr::str_sub(NHGISST, 1, 2),
+         county_fips = stringr::str_sub(NHGISCTY, 1, 3),
+         tract_fips = stringr::str_sub(GISJOIN2, 8, -1),
+         census_tract_id_1980 = glue::glue('{state_fips}{county_fips}{tract_fips}')) %>%
+  dplyr::select(census_tract_id_1980,
                 geometry) %>%
-  mutate(fips_tract_id_1980 = as.character(fips_tract_id_1980))
+  mutate(census_tract_id_1980 = as.character(census_tract_id_1980))
 
 tracts_sf_1980 <- sf::st_make_valid(tracts_sf_1980)
 
@@ -69,19 +64,24 @@ saveRDS(tracts_sf_1980, "tracts_1980_5072.rds")
 # s3 location s3://geomarker/geometries/tracts_1980_5072.rds
 
 # 1970
-tracts_sf_1970 <- st_read('nhgis0018_shape/nhgis0018_shapefile_tl2000_us_tract_1970/US_tract_1970.shp')
+tracts_sf_1970 <- st_read('nhgis0018_shape/nhgis0018_shapefile_tl2000_us_tract_1970/US_tract_1970.shp') # repo-relative path, same layout as the 1980 tract read (not a user-specific ~/Downloads path)
 
 tracts_sf_1970 <- st_transform(tracts_sf_1970, crs=5072)
 
+## drop all rows whose GISJOIN2 is duplicated (presumably the incomplete tract identifiers, e.g. "3600050nodata")
+remove <- tracts_sf_1970[duplicated(tracts_sf_1970$GISJOIN2),]$GISJOIN2
+
+tracts_sf_1970 <- tracts_sf_1970 %>%
+  filter(!GISJOIN2 %in% remove)
+
 tracts_sf_1970 <- tracts_sf_1970 %>%
-  dplyr::mutate(state_fips = stringr::str_sub(NHGISST, 1, 2),
-                county_fips = stringr::str_sub(NHGISCTY, 1, 3),
-                tract_fips = stringr::str_sub(GISJOIN, 9, -1),
-                tract_fips = stringr::str_pad(tract_fips, 6, pad = "0"),
-                fips_tract_id_1970 = glue::glue('{state_fips}{county_fips}{tract_fips}')) %>%
-  dplyr::select(fips_tract_id_1970,
+  mutate(state_fips = stringr::str_sub(NHGISST, 1, 2),
+         county_fips = stringr::str_sub(NHGISCTY, 1, 3),
+         tract_fips = stringr::str_sub(GISJOIN2, 8, -1),
+         census_tract_id_1970 = glue::glue('{state_fips}{county_fips}{tract_fips}')) %>%
+  dplyr::select(census_tract_id_1970,
                 geometry) %>%
-  mutate(fips_tract_id_1970 = as.character(fips_tract_id_1970))
+  mutate(census_tract_id_1970 = as.character(census_tract_id_1970))
 
 tracts_sf_1970 <- sf::st_make_valid(tracts_sf_1970)
 
diff --git a/01_get_2020_block_groups.R b/01_get_2020_block_groups.R
index bee21de..7996454 100644
--- a/01_get_2020_block_groups.R
+++ b/01_get_2020_block_groups.R
@@ -26,6 +26,7 @@ states <-
 
 block_group_shp <- pmap(list(states$fl_name, states$state_folder, states$STATEFP),
                         possibly(get_block_group_shp, NA_real_))
+#block_group_shp[[6]] <- get_block_group_shp(states$fl_name[6], states$state_folder[6], states$STATEFP[6])
 names(block_group_shp) <- states$NAME
 
 block_group_shp_all <- block_group_shp[[1]]
@@ -35,13 +36,10 @@ for (i in 2:length(block_group_shp)) {
 }
 
 blk_grps_sf_2020 <- block_group_shp_all %>%
-  dplyr::select(fips_block_group_id_2020 = GEOID20,
+  dplyr::select(census_block_group_id_2020 = GEOID20,
                 geometry) %>%
-  mutate(fips_block_group_id_2020 = as.character(fips_block_group_id_2020))
+  mutate(census_block_group_id_2020 = as.character(census_block_group_id_2020))
 
 saveRDS(blk_grps_sf_2020, "block_groups_2020_5072.rds")
 
 
-
-
-
diff --git a/Dockerfile b/Dockerfile
index d76666d..0e5fa92 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ FROM rocker/r-ver:4.0.4
 
 # DeGAUSS container metadata
 ENV degauss_name="census_block_group"
-ENV degauss_version="0.5.1"
+ENV degauss_version="0.6.0"
 ENV degauss_description="census block group and tract"
 ENV degauss_argument="census year [default: 2010]"
 
diff --git a/README.md b/README.md
index b5cf211..13a92a9 100644
--- a/README.md
+++ b/README.md
@@ -8,13 +8,13 @@
 If `my_address_file_geocoded.csv` is a file in the current working directory with coordinate columns named `lat` and `lon`, then the [DeGAUSS command](https://degauss.org/using_degauss.html#DeGAUSS_Commands):
 
 ```sh
-docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/census_block_group:0.5.1 my_address_file_geocoded.csv
+docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/census_block_group:0.6.0 my_address_file_geocoded.csv
 ```
 
-will produce `my_address_file_geocoded_census_block_group_0.5.1_2010.csv` with added columns:
+will produce `my_address_file_geocoded_census_block_group_0.6.0_2010.csv` with added columns:
 
-- **`fips_block_group_id_2010`**: identifier for 2010 block group
-- **`fips_tract_id_2010`**: identifier for 2010 tract
+- **`census_block_group_id_2010`**: identifier for 2010 block group
+- **`census_tract_id_2010`**: identifier for 2010 tract
 
 ### Optional Argument
 
@@ -24,12 +24,24 @@ The default census year is 2010, but can be changed by supplying an optional arg
 docker run --rm -v $PWD:/tmp ghcr.io/degauss-org/census_block_group:0.5.0 my_address_file_geocoded.csv 1990
 ```
 
-will produce `my_address_file_geocoded_census_block_group_0.5.1_1990.csv`, with columns called **`fips_block_group_id_1990`** and **`fips_tract_id_1990`**. 
+will produce `my_address_file_geocoded_census_block_group_0.6.0_1990.csv`, with columns called **`census_block_group_id_1990`** and **`census_tract_id_1990`**. 
 
 Available years for census block group and census tract identifiers include 1990, 2000, 2010, and 2020. Additionally, tracts identifiers are available for 1970 and 1980.
 
+## st_census_tract
+
+For spatiotemporal data in which each location is associated with a specified date range, consider using the [`st_census_tract`](https://degauss.org/st_census_tract/) container, which adds census tract identifiers for the appropriate vintage (1970-2020) based on `start_date` and `end_date` for each input location.
+
 ## Geomarker Methods
 
+- Block group shape files were downloaded from [nhgis.org](https://www.nhgis.org/) and reprojected to EPSG 5072.
+
+- All shape files were made valid using `sf::st_make_valid`.
+
+- 2020 block groups were not yet available via NHGIS, and were downloaded directly from the [U.S. Census](https://www2.census.gov).
+
+## Geomarker Data
+
 - Census block groups are a low level designation within the US Census geographical hierarchy, one degree finer than a census tract. The US Census provides a diagram visualizing the [hierarchy](https://www2.census.gov/geo/pdfs/reference/geodiagram.pdf).
 - The first 11 characters in a census block group GEOID indicate the census tract, county and state that the block group lies within. The US Census GEOIDs are constructed in a manner that reflects the geographical hierary of the designated area. By using the segments of the GEOID, it is possible to select data based on area types further up in the hierarchy.
 
@@ -40,13 +52,7 @@ Available years for census block group and census tract identifiers include 1990
     | Census Tract | State + County + Tract | 2+3+6=11 | Tract 32 in Hamilton County | 39061003200 | 
     | Block Group | State + County + Tract +<br /> Block Group | 2+3+6+1=12 | Block Group 1 in Tract 32 | 390610032001 |
     
-Due to inconsistencies in the 1970 and 1980 tract identifiers, we concatenated the state FIPS (`NHGISST`), county FIPS (`NGHISCTY`), and tract FIPS (the last 4 or 6 digits of `GISJOIN2`) to construct the full `fips_tract_id`. Since the length of tract FIPS codes varied, we padded all tract FIPS to the maximum 6 digits using zeros. 
-
-## st_census_tract
-
-For spatiotemporal data in which each location is associated with a specified date range, consider using the [`st_census_tract`](https://degauss.org/st_census_tract/) container, which adds census tract identifiers for the appropriate vintage (1970-2020) based on `start_date` and `end_date` for each input location.
-
-## Geomarker Data
+*Block group identifiers are defined as the concatenation of the state, county, tract, and block group FIPS identifiers (commonly called GISJOIN or GEOID in census data). All census tract identifiers are 11 digits and all census block group identifiers are 12 digits, with the exception of some 1990, 1980, and 1970 tracts that are 9 digits, resulting in 10-digit block group identifiers.*
 
 - block group shapefiles for 1990, 2000, and 2010, as well as tract shapefiles for 1970 and 1980, were obtained from [NHGIS](https://www.nhgis.org/) and transformed using the `00_make_block_group_shp.R` file in this repository.
 
diff --git a/entrypoint.R b/entrypoint.R
index f7b2975..b674340 100755
--- a/entrypoint.R
+++ b/entrypoint.R
@@ -53,8 +53,8 @@ d$d <- suppressWarnings( sf::st_join(d$d, geography, left = FALSE, largest = TRU
 
 if(! opt$census_year %in% c('1980', '1970')) {
   d$d <- d$d %>%
-    mutate_at(vars(starts_with(glue::glue('fips_block_group_id_{opt$census_year}'))),
-              list(fips_tract_id = ~stringr::str_sub(.x, 1, 11)))
+    mutate_at(vars(starts_with(glue::glue('census_block_group_id_{opt$census_year}'))),
+              list(census_tract_id = ~stringr::str_sub(.x, 1, -2)))  # tract id = block group id minus its final (block group) digit; ids may be 10 or 12 characters

   names(d$d)[ncol(d$d)] <- glue::glue('{names(d$d)[ncol(d$d)]}_{opt$census_year}')
 }
diff --git a/test/my_address_file_geocoded_census_block_group_0.6.0_1970.csv b/test/my_address_file_geocoded_census_block_group_0.6.0_1970.csv
new file mode 100644
index 0000000..d4e74e7
--- /dev/null
+++ b/test/my_address_file_geocoded_census_block_group_0.6.0_1970.csv
@@ -0,0 +1,6 @@
+id,lat,lon,start_date,end_date,census_tract_id_1970
+55001310120,NA,NA,6/11/20,6/18/20,NA
+55000100280,39.19674,-84.582601,3/1/17,3/8/17,39061020801
+55000100281,39.28765,-84.510173,1/30/12,2/6/12,39061021502
+55000100282,39.158521,-84.417572,12/1/20,12/8/20,390610054
+55000100283,39.158521,-84.417572,4/8/19,4/15/19,390610054
diff --git a/test/my_address_file_geocoded_census_block_group_0.6.0_1980.csv b/test/my_address_file_geocoded_census_block_group_0.6.0_1980.csv
new file mode 100644
index 0000000..90b7702
--- /dev/null
+++ b/test/my_address_file_geocoded_census_block_group_0.6.0_1980.csv
@@ -0,0 +1,6 @@
+id,lat,lon,start_date,end_date,census_tract_id_1980
+55001310120,NA,NA,6/11/20,6/18/20,NA
+55000100280,39.19674,-84.582601,3/1/17,3/8/17,39061020801
+55000100281,39.28765,-84.510173,1/30/12,2/6/12,39061021505
+55000100282,39.158521,-84.417572,12/1/20,12/8/20,390610054
+55000100283,39.158521,-84.417572,4/8/19,4/15/19,390610054
diff --git a/test/my_address_file_geocoded_census_block_group_0.6.0_1990.csv b/test/my_address_file_geocoded_census_block_group_0.6.0_1990.csv
new file mode 100644
index 0000000..e21f241
--- /dev/null
+++ b/test/my_address_file_geocoded_census_block_group_0.6.0_1990.csv
@@ -0,0 +1,6 @@
+id,lat,lon,start_date,end_date,census_block_group_id_1990,census_tract_id_1990
+55001310120,NA,NA,6/11/20,6/18/20,NA,NA
+55000100280,39.19674,-84.582601,3/1/17,3/8/17,390610208011,39061020801
+55000100281,39.28765,-84.510173,1/30/12,2/6/12,390610215051,39061021505
+55000100282,39.158521,-84.417572,12/1/20,12/8/20,3906100543,390610054
+55000100283,39.158521,-84.417572,4/8/19,4/15/19,3906100543,390610054
diff --git a/test/my_address_file_geocoded_census_block_group_0.6.0_2000.csv b/test/my_address_file_geocoded_census_block_group_0.6.0_2000.csv
new file mode 100644
index 0000000..454dfd1
--- /dev/null
+++ b/test/my_address_file_geocoded_census_block_group_0.6.0_2000.csv
@@ -0,0 +1,6 @@
+id,lat,lon,start_date,end_date,census_block_group_id_2000,census_tract_id_2000
+55001310120,NA,NA,6/11/20,6/18/20,NA,NA
+55000100280,39.19674,-84.582601,3/1/17,3/8/17,390610208111,39061020811
+55000100281,39.28765,-84.510173,1/30/12,2/6/12,390610215051,39061021505
+55000100282,39.158521,-84.417572,12/1/20,12/8/20,390610054001,39061005400
+55000100283,39.158521,-84.417572,4/8/19,4/15/19,390610054001,39061005400
diff --git a/test/my_address_file_geocoded_census_block_group_0.6.0_2010.csv b/test/my_address_file_geocoded_census_block_group_0.6.0_2010.csv
new file mode 100644
index 0000000..50b7564
--- /dev/null
+++ b/test/my_address_file_geocoded_census_block_group_0.6.0_2010.csv
@@ -0,0 +1,6 @@
+id,lat,lon,start_date,end_date,census_block_group_id_2010,census_tract_id_2010
+55001310120,NA,NA,6/11/20,6/18/20,NA,NA
+55000100280,39.19674,-84.582601,3/1/17,3/8/17,390610208111,39061020811
+55000100281,39.28765,-84.510173,1/30/12,2/6/12,390610215051,39061021505
+55000100282,39.158521,-84.417572,12/1/20,12/8/20,390610054001,39061005400
+55000100283,39.158521,-84.417572,4/8/19,4/15/19,390610054001,39061005400
diff --git a/test/my_address_file_geocoded_census_block_group_0.6.0_2020.csv b/test/my_address_file_geocoded_census_block_group_0.6.0_2020.csv
new file mode 100644
index 0000000..8ee4cd1
--- /dev/null
+++ b/test/my_address_file_geocoded_census_block_group_0.6.0_2020.csv
@@ -0,0 +1,6 @@
+id,lat,lon,start_date,end_date,census_block_group_id_2020,census_tract_id_2020
+55001310120,NA,NA,6/11/20,6/18/20,NA,NA
+55000100280,39.19674,-84.582601,3/1/17,3/8/17,390610208111,39061020811
+55000100281,39.28765,-84.510173,1/30/12,2/6/12,390610215051,39061021505
+55000100282,39.158521,-84.417572,12/1/20,12/8/20,390610276001,39061027600
+55000100283,39.158521,-84.417572,4/8/19,4/15/19,390610276001,39061027600