diff --git a/.github/actions/r_build/action.yml b/.github/actions/r_build/action.yml index 12997393e..412219d47 100644 --- a/.github/actions/r_build/action.yml +++ b/.github/actions/r_build/action.yml @@ -7,6 +7,18 @@ runs: shell: bash run: | sudo apt-get update && sudo apt-get install -y curl libcurl4-openssl-dev pkg-config libharfbuzz-dev libfribidi-dev + - name: Configure python interpreter + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python }} + - name: Install python dependencies + shell: bash + run: | + # - install pip libs + # note: gdal requires the extra args + cd python + pip install build wheel pyspark==${{ matrix.spark }} numpy==${{ matrix.numpy }} + pip install --no-build-isolation --no-cache-dir --force-reinstall gdal==${{ matrix.gdal }} - name: Create download location for Spark shell: bash run: | diff --git a/R/generate_R_bindings.R b/R/generate_R_bindings.R index f60199efb..d4ac99baa 100644 --- a/R/generate_R_bindings.R +++ b/R/generate_R_bindings.R @@ -214,7 +214,7 @@ main <- function(scala_file_path){ closeAllConnections() # supplementary files - sparkr_supplementary_files <- c("sparkR-mosaic/enableMosaic.R") + sparkr_supplementary_files <- c("sparkR-mosaic/enableMosaic.R", "sparkR-mosaic/enableGDAL.R") copy_supplementary_file(sparkr_supplementary_files, "sparkR-mosaic/sparkrMosaic/R") ########################## @@ -226,8 +226,8 @@ main <- function(scala_file_path){ closeAllConnections() # supplementary files - sparkr_supplementary_files <- c("sparklyr-mosaic/enableMosaic.R", "sparklyr-mosaic/sparkFunctions.R") - copy_supplementary_file(sparkr_supplementary_files, "sparklyr-mosaic/sparklyrMosaic/R/") + sparklyr_supplementary_files <- c("sparklyr-mosaic/enableMosaic.R", "sparklyr-mosaic/sparkFunctions.R", "sparklyr-mosaic/enableGDAL.R") + copy_supplementary_file(sparklyr_supplementary_files, "sparklyr-mosaic/sparklyrMosaic/R/") } diff --git a/R/install_deps.R b/R/install_deps.R index c9751b103..1d5e7bc2a 100644 --- 
a/R/install_deps.R +++ b/R/install_deps.R @@ -1,3 +1,3 @@ options(repos = c(CRAN = "https://packagemanager.posit.co/cran/__linux__/jammy/latest")) -install.packages(c("pkgbuild", "testthat", "roxygen2", "sparklyr")) +install.packages(c("pkgbuild", "testthat", "roxygen2", "sparklyr", "readr", "sparklyr.nested")) diff --git a/R/sparkR-mosaic/enableGDAL.R b/R/sparkR-mosaic/enableGDAL.R new file mode 100644 index 000000000..6138ed8f3 --- /dev/null +++ b/R/sparkR-mosaic/enableGDAL.R @@ -0,0 +1,14 @@ +#' enableGDAL +#' +#' @description enableGDAL activates GDAL extensions for Mosaic +#' @name enableGDAL +#' @rdname enableGDAL +#' @return None +#' @export enableGDAL +#' @examples +#' \dontrun{ +#' enableGDAL() } +enableGDAL <- function( +){ + sparkR.callJStatic(x="com.databricks.labs.mosaic.gdal.MosaicGDAL", methodName="enableGDAL", sparkR.session()) +} \ No newline at end of file diff --git a/R/sparkR-mosaic/sparkrMosaic/.Rbuildignore b/R/sparkR-mosaic/sparkrMosaic/.Rbuildignore index 4e2484e14..04802af1f 100644 --- a/R/sparkR-mosaic/sparkrMosaic/.Rbuildignore +++ b/R/sparkR-mosaic/sparkrMosaic/.Rbuildignore @@ -1,2 +1,2 @@ ^sparkrMosaic\.Rproj$ -^\.Rproj\.user$ +^\.Rproj\.user$ \ No newline at end of file diff --git a/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION b/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION index 7bc9e3f62..f689fe17a 100644 --- a/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION +++ b/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION @@ -1,7 +1,7 @@ Package: sparkrMosaic Title: SparkR bindings for Databricks Mosaic Version: 0.4.0 -Authors@R: +Authors@R: person("Robert", "Whiffin", , "robert.whiffin@databricks.com", role = c("aut", "cre") ) Description: This package extends SparkR to bring the Databricks Mosaic for geospatial processing APIs into SparkR. 
@@ -9,13 +9,15 @@ License: Databricks Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 -Collate: +Collate: + 'enableGDAL.R' 'enableMosaic.R' 'generics.R' 'functions.R' Imports: SparkR, methods -Suggests: - testthat (>= 3.0.0) -Config/testthat/edition: 3 +Suggests: + testthat (>= 3.0.0), + readr (>= 2.1.5) +Config/testthat/edition: 3 \ No newline at end of file diff --git a/R/sparkR-mosaic/sparkrMosaic/sparkrMosaic.Rproj b/R/sparkR-mosaic/sparkrMosaic/sparkrMosaic.rproj similarity index 88% rename from R/sparkR-mosaic/sparkrMosaic/sparkrMosaic.Rproj rename to R/sparkR-mosaic/sparkrMosaic/sparkrMosaic.rproj index aaa62a52c..bc70eae8c 100644 --- a/R/sparkR-mosaic/sparkrMosaic/sparkrMosaic.Rproj +++ b/R/sparkR-mosaic/sparkrMosaic/sparkrMosaic.rproj @@ -14,4 +14,4 @@ LineEndingConversion: Posix BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace +PackageRoxygenize: rd,collate,namespace \ No newline at end of file diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/Blocks2020.zip b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/Blocks2020.zip new file mode 100644 index 000000000..7c367746e Binary files /dev/null and b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/Blocks2020.zip differ diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/MCD43A4.A2018185.h10v07.006.2018194033728_B04.TIF b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/MCD43A4.A2018185.h10v07.006.2018194033728_B04.TIF new file mode 100644 index 000000000..2eae2a2ad Binary files /dev/null and b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/MCD43A4.A2018185.h10v07.006.2018194033728_B04.TIF differ diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data.R b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/boroughs.geojson similarity index 99% rename from R/sparkR-mosaic/sparkrMosaic/tests/testthat/data.R rename to 
R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/boroughs.geojson index 1ad46c114..93f5be529 100644 --- a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data.R +++ b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/boroughs.geojson @@ -1,4 +1,4 @@ -inputGJ = '{ +{ "type":"Feature", "properties":{ "shape_area":"0.0000607235737749", @@ -225,4 +225,4 @@ inputGJ = '{ ] ] } -}' \ No newline at end of file +} \ No newline at end of file diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc new file mode 100644 index 000000000..33a03c5af Binary files /dev/null and b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc differ diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R new file mode 100644 index 000000000..36296e9d8 --- /dev/null +++ b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R @@ -0,0 +1,140 @@ +generate_singleband_raster_df <- function() { + read.df( + path = "sparkrMosaic/tests/testthat/data/MCD43A4.A2018185.h10v07.006.2018194033728_B04.TIF", + source = "gdal", + raster.read.strategy = "in_memory" + ) +} + +test_that("mosaic can read single-band GeoTiff", { + sdf <- generate_singleband_raster_df() + row <- first(sdf) + expect_equal(row$length, 1067862L) + expect_equal(row$x_size, 2400) + expect_equal(row$y_size, 2400) + expect_equal(row$srid, 0) + expect_equal(row$bandCount, 1) + expect_equal(row$metadata[[1]]$LONGNAME, "MODIS/Terra+Aqua BRDF/Albedo Nadir BRDF-Adjusted Ref Daily L3 Global - 500m") + expect_equal(row$tile[[1]]$driver, "GTiff") + +}) + +test_that("scalar raster functions behave as intended", { + 
sdf <- generate_singleband_raster_df() + sdf <- withColumn(sdf, "rst_rastertogridavg", rst_rastertogridavg(column("tile"), lit(9L))) + sdf <- withColumn(sdf, "rst_rastertogridcount", rst_rastertogridcount(column("tile"), lit(9L))) + sdf <- withColumn(sdf, "rst_rastertogridmax", rst_rastertogridmax(column("tile"), lit(9L))) + sdf <- withColumn(sdf, "rst_rastertogridmedian", rst_rastertogridmedian(column("tile"), lit(9L))) + sdf <- withColumn(sdf, "rst_rastertogridmin", rst_rastertogridmin(column("tile"), lit(9L))) + sdf <- withColumn(sdf, "rst_rastertoworldcoordx", rst_rastertoworldcoordx(column("tile"), lit(1200L), lit(1200L))) + sdf <- withColumn(sdf, "rst_rastertoworldcoordy", rst_rastertoworldcoordy(column("tile"), lit(1200L), lit(1200L))) + sdf <- withColumn(sdf, "rst_rastertoworldcoord", rst_rastertoworldcoord(column("tile"), lit(1200L), lit(1200L))) + sdf <- withColumn(sdf, "rst_rotation", rst_rotation(column("tile"))) + sdf <- withColumn(sdf, "rst_scalex", rst_scalex(column("tile"))) + sdf <- withColumn(sdf, "rst_scaley", rst_scaley(column("tile"))) + sdf <- withColumn(sdf, "rst_srid", rst_srid(column("tile"))) + sdf <- withColumn(sdf, "rst_summary", rst_summary(column("tile"))) + sdf <- withColumn(sdf, "rst_upperleftx", rst_upperleftx(column("tile"))) + sdf <- withColumn(sdf, "rst_upperlefty", rst_upperlefty(column("tile"))) + sdf <- withColumn(sdf, "rst_width", rst_width(column("tile"))) + sdf <- withColumn(sdf, "rst_worldtorastercoordx", rst_worldtorastercoordx(column("tile"), lit(0.0), lit(0.0))) + sdf <- withColumn(sdf, "rst_worldtorastercoordy", rst_worldtorastercoordy(column("tile"), lit(0.0), lit(0.0))) + sdf <- withColumn(sdf, "rst_worldtorastercoord", rst_worldtorastercoord(column("tile"), lit(0.0), lit(0.0))) + + expect_no_error(write.df(sdf, source = "noop", mode = "overwrite")) +}) + +test_that("raster flatmap functions behave as intended", { + retiled_sdf <- generate_singleband_raster_df() + retiled_sdf <- withColumn(retiled_sdf, "rst_retile", 
rst_retile(column("tile"), lit(1200L), lit(1200L))) + + expect_no_error(write.df(retiled_sdf, source = "noop", mode = "overwrite")) + expect_equal(nrow(retiled_sdf), 4) + + subdivide_sdf <- generate_singleband_raster_df() + subdivide_sdf <- withColumn(subdivide_sdf, "rst_subdivide", rst_subdivide(column("tile"), lit(1L))) + + expect_no_error(write.df(subdivide_sdf, source = "noop", mode = "overwrite")) + expect_equal(nrow(subdivide_sdf), 4) + + tessellate_sdf <- generate_singleband_raster_df() + tessellate_sdf <- withColumn(tessellate_sdf, "rst_tessellate", rst_tessellate(column("tile"), lit(3L))) + + expect_no_error(write.df(tessellate_sdf, source = "noop", mode = "overwrite")) + expect_equal(nrow(tessellate_sdf), 66) + + overlap_sdf <- generate_singleband_raster_df() + overlap_sdf <- withColumn(overlap_sdf, "rst_to_overlapping_tiles", rst_to_overlapping_tiles(column("tile"), lit(200L), lit(200L), lit(10L))) + + expect_no_error(write.df(overlap_sdf, source = "noop", mode = "overwrite")) + expect_equal(nrow(overlap_sdf), 87) +}) + +test_that("raster aggregation functions behave as intended", { + collection_sdf <- generate_singleband_raster_df() + collection_sdf <- withColumn(collection_sdf, "extent", st_astext(rst_boundingbox(column("tile")))) + collection_sdf <- withColumn(collection_sdf, "tile", rst_to_overlapping_tiles(column("tile"), lit(200L), lit(200L), lit(10L))) + + merge_sdf <- summarize( + groupBy(collection_sdf, "path"), + alias(rst_merge_agg(column("tile")), "tile") + ) + merge_sdf <- withColumn(merge_sdf, "extent", st_astext(rst_boundingbox(column("tile")))) + + expect_equal(nrow(merge_sdf), 1) + expect_equal(first(collection_sdf)$extent, first(merge_sdf)$extent) + + combine_avg_sdf <- summarize( + groupBy(collection_sdf, "path"), + alias(rst_combineavg_agg(column("tile")), "tile") + ) + combine_avg_sdf <- withColumn(combine_avg_sdf, "extent", st_astext(rst_boundingbox(column("tile")))) + + expect_equal(nrow(combine_avg_sdf), 1) + 
expect_equal(first(collection_sdf)$extent, first(combine_avg_sdf)$extent) + +}) + +test_that("the tessellate-join-clip-merge flow works on NetCDF files", { + target_resolution <- 1L + + region_keys <- c("NAME", "STATE", "BOROUGH", "BLOCK", "TRACT") + + census_sdf <- read.df( + path = "sparkrMosaic/tests/testthat/data/Blocks2020.zip", + source = "com.databricks.labs.mosaic.datasource.OGRFileFormat", + vsizip = "true", + chunkSize = "20" + ) + + census_sdf <- select(census_sdf, c(region_keys, "geom_0", "geom_0_srid")) + census_sdf <- distinct(census_sdf) + census_sdf <- withColumn(census_sdf, "geom_0", st_simplify(column("geom_0"), lit(0.001))) + census_sdf <- withColumn(census_sdf, "geom_0", st_updatesrid(column("geom_0"), column("geom_0_srid"), lit(4326L))) + census_sdf <- withColumn(census_sdf, "chip", grid_tessellateexplode(column("geom_0"), lit(target_resolution))) + census_sdf <- select(census_sdf, c(region_keys, "chip.*")) + + raster_sdf <- read.df( + path = "sparkrMosaic/tests/testthat/data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc", + source = "gdal", + raster.read.strategy = "retile_on_read" + ) + + raster_sdf <- withColumn(raster_sdf, "tile", rst_separatebands(column("tile"))) + raster_sdf <- withColumn(raster_sdf, "timestep", element_at(rst_metadata(column("tile")), "NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX")) + raster_sdf <- where(raster_sdf, "timestep = 21") + raster_sdf <- withColumn(raster_sdf, "tile", rst_setsrid(column("tile"), lit(4326L))) + raster_sdf <- withColumn(raster_sdf, "tile", rst_to_overlapping_tiles(column("tile"), lit(20L), lit(20L), lit(10L))) + raster_sdf <- withColumn(raster_sdf, "tile", rst_tessellate(column("tile"), lit(target_resolution))) + + clipped_sdf <- join(raster_sdf, census_sdf, raster_sdf$tile.index_id == census_sdf$index_id) + clipped_sdf <- withColumn(clipped_sdf, "tile", rst_clip(column("tile"), column("wkb"))) + + merged_precipitation <- summarize( + groupBy(clipped_sdf, 
"timestep"), + alias(rst_merge_agg(column("tile")), "tile") + ) + + expect_equal(nrow(merged_precipitation), 1) + +}) \ No newline at end of file diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R index 205aee809..154a4cb0f 100644 --- a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R +++ b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R @@ -1,9 +1,7 @@ -source("data.R") - test_that("scalar vector functions behave as intended", { - sdf <- SparkR::createDataFrame( + sdf <- createDataFrame( data.frame( - wkt = "POLYGON ((0 0, 0 2, 1 2, 1 0, 0 0))", + wkt = "POLYGON ((2 1, 1 2, 2 3, 2 1))", point_wkt = "POINT (1 1)" ) ) @@ -52,14 +50,16 @@ test_that("scalar vector functions behave as intended", { sdf <- withColumn(sdf, "mosaic_explode", mosaic_explode(column("wkt"), lit(1L))) sdf <- withColumn(sdf, "mosaicfill", mosaicfill(column("wkt"), lit(1L))) - expect_no_error(SparkR::write.df(sdf, source = "noop", mode = "overwrite")) + expect_no_error(write.df(sdf, source = "noop", mode = "overwrite")) expect_equal(nrow(sdf), 1) }) test_that("aggregate vector functions behave as intended", { - sdf <- SparkR::sql("SELECT id as location_id FROM range(1)") + sdf <- sql("SELECT id as location_id FROM range(1)") + + inputGJ <- read_file("data/boroughs.geojson") sdf <- withColumn(sdf, "geometry", st_geomfromgeojson(lit(inputGJ))) expect_equal(nrow(sdf), 1) diff --git a/R/sparkR-mosaic/tests.R b/R/sparkR-mosaic/tests.R index 375e77a65..556b48b0f 100644 --- a/R/sparkR-mosaic/tests.R +++ b/R/sparkR-mosaic/tests.R @@ -1,4 +1,6 @@ +options(warn = -1) library(testthat) +library(readr) spark_location <- Sys.getenv("SPARK_HOME") library(SparkR, lib.loc = c(file.path(spark_location, "R", "lib"))) @@ -12,7 +14,7 @@ install.packages(package_file, repos=NULL) library(sparkrMosaic) # find the mosaic jar in staging -staging_dir <- 
"/home/runner/work/mosaic/mosaic/staging/" +staging_dir <- Sys.getenv("MOSAIC_LIB_PATH", "/home/runner/work/mosaic/mosaic/staging/") mosaic_jar <- list.files(staging_dir) mosaic_jar <- mosaic_jar[grep("jar-with-dependencies.jar", mosaic_jar, fixed=T)] print("Looking for mosaic jar in") diff --git a/R/sparklyr-mosaic/enableGDAL.R b/R/sparklyr-mosaic/enableGDAL.R new file mode 100644 index 000000000..2e0001cbc --- /dev/null +++ b/R/sparklyr-mosaic/enableGDAL.R @@ -0,0 +1,17 @@ +#' enableGDAL +#' +#' @description enableGDAL activates GDAL extensions for Mosaic +#' @param sc sparkContext +#' @name enableGDAL +#' @rdname enableGDAL +#' @return None +#' @export enableGDAL +#' @examples +#' \dontrun{ +#' enableGDAL(sc)} + +enableGDAL <- function( + sc +){ + sparklyr::invoke_static(sc, class="com.databricks.labs.mosaic.gdal.MosaicGDAL", method="enableGDAL", spark_session(sc)) +} diff --git a/R/sparklyr-mosaic/sparklyrMosaic/.Rbuildignore b/R/sparklyr-mosaic/sparklyrMosaic/.Rbuildignore index 4e2484e14..04802af1f 100644 --- a/R/sparklyr-mosaic/sparklyrMosaic/.Rbuildignore +++ b/R/sparklyr-mosaic/sparklyrMosaic/.Rbuildignore @@ -1,2 +1,2 @@ ^sparkrMosaic\.Rproj$ -^\.Rproj\.user$ +^\.Rproj\.user$ \ No newline at end of file diff --git a/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION b/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION index 4dbd7b03d..c9d2048fb 100644 --- a/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION +++ b/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION @@ -1,7 +1,7 @@ Package: sparklyrMosaic Title: sparklyr bindings for Databricks Mosaic Version: 0.4.0 -Authors@R: +Authors@R: person("Robert", "Whiffin", , "robert.whiffin@databricks.com", role = c("aut", "cre") ) Description: This package extends sparklyr to bring the Databricks Mosaic for geospatial processing APIs into sparklyr . 
@@ -9,12 +9,15 @@ License: Databricks Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 -Collate: +Collate: + 'enableGDAL.R' 'enableMosaic.R' 'sparkFunctions.R' - 'functions.R' + 'functions.R' Imports: sparklyr -Suggests: - testthat (>= 3.0.0) -Config/testthat/edition: 3 +Suggests: + testthat (>= 3.0.0), + sparklyr.nested (>= 0.0.4), + readr (>= 2.1.5) +Config/testthat/edition: 3 \ No newline at end of file diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/Blocks2020.zip b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/Blocks2020.zip new file mode 100644 index 000000000..7c367746e Binary files /dev/null and b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/Blocks2020.zip differ diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/MCD43A4.A2018185.h10v07.006.2018194033728_B04.TIF b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/MCD43A4.A2018185.h10v07.006.2018194033728_B04.TIF new file mode 100644 index 000000000..2eae2a2ad Binary files /dev/null and b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/MCD43A4.A2018185.h10v07.006.2018194033728_B04.TIF differ diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data.R b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/boroughs.geojson similarity index 99% rename from R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data.R rename to R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/boroughs.geojson index 1ad46c114..93f5be529 100644 --- a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data.R +++ b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/boroughs.geojson @@ -1,4 +1,4 @@ -inputGJ = '{ +{ "type":"Feature", "properties":{ "shape_area":"0.0000607235737749", @@ -225,4 +225,4 @@ inputGJ = '{ ] ] } -}' \ No newline at end of file +} \ No newline at end of file diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc 
b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc new file mode 100644 index 000000000..33a03c5af Binary files /dev/null and b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc differ diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R new file mode 100644 index 000000000..3bb021c64 --- /dev/null +++ b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R @@ -0,0 +1,181 @@ +generate_singleband_raster_df <- function() { + spark_read_source( + sc, + name = "raster", + source = "gdal", + path = "data/MCD43A4.A2018185.h10v07.006.2018194033728_B04.TIF", + options = list("raster.read.strategy" = "in_memory") + ) +} + + +test_that("mosaic can read single-band GeoTiff", { + sdf <- generate_singleband_raster_df() + row <- sdf %>% head(1) %>% sdf_collect + expect_equal(row$length, 1067862L) + expect_equal(row$x_size, 2400) + expect_equal(row$y_size, 2400) + expect_equal(row$srid, 0) + expect_equal(row$bandCount, 1) + expect_equal(row$metadata[[1]]$LONGNAME, "MODIS/Terra+Aqua BRDF/Albedo Nadir BRDF-Adjusted Ref Daily L3 Global - 500m") + expect_equal(row$tile[[1]]$driver, "GTiff") + +}) + + +test_that("scalar raster functions behave as intended", { + sdf <- generate_singleband_raster_df() %>% + mutate(rst_bandmetadata = rst_bandmetadata(tile, 1L)) %>% + mutate(rst_boundingbox = rst_boundingbox(tile)) %>% + mutate(rst_boundingbox = st_buffer(rst_boundingbox, -0.001)) %>% + mutate(rst_clip = rst_clip(tile, rst_boundingbox)) %>% + mutate(rst_combineavg = rst_combineavg(array(tile, rst_clip))) %>% + mutate(rst_frombands = rst_frombands(array(tile, tile))) %>% + mutate(rst_fromfile = rst_fromfile(path, -1L)) %>% + mutate(rst_georeference = rst_georeference(tile)) %>% 
+ mutate(rst_getnodata = rst_getnodata(tile)) %>% + mutate(rst_subdatasets = rst_subdatasets(tile)) %>% + mutate(rst_height = rst_height(tile)) %>% + mutate(rst_initnodata = rst_initnodata(tile)) %>% + mutate(rst_isempty = rst_isempty(tile)) %>% + mutate(rst_memsize = rst_memsize(tile)) %>% + mutate(rst_merge = rst_merge(array(tile, tile))) %>% + mutate(rst_metadata = rst_metadata(tile)) %>% + mutate(rst_ndvi = rst_ndvi(tile, 1L, 1L)) %>% + mutate(rst_numbands = rst_numbands(tile)) %>% + mutate(rst_pixelheight = rst_pixelheight(tile)) %>% + mutate(rst_pixelwidth = rst_pixelwidth(tile)) + + # breaking the chain here to avoid memory issues + expect_no_error(spark_write_source(sdf, "noop", mode = "overwrite")) + + sdf <- generate_singleband_raster_df() %>% + mutate(rst_rastertogridavg = rst_rastertogridavg(tile, 9L)) %>% + mutate(rst_rastertogridcount = rst_rastertogridcount(tile, 9L)) %>% + mutate(rst_rastertogridmax = rst_rastertogridmax(tile, 9L)) %>% + mutate(rst_rastertogridmedian = rst_rastertogridmedian(tile, 9L)) %>% + mutate(rst_rastertogridmin = rst_rastertogridmin(tile, 9L)) %>% + mutate(rst_rastertoworldcoordx = rst_rastertoworldcoordx(tile, 1200L, 1200L)) %>% + mutate(rst_rastertoworldcoordy = rst_rastertoworldcoordy(tile, 1200L, 1200L)) %>% + mutate(rst_rastertoworldcoord = rst_rastertoworldcoord(tile, 1200L, 1200L)) %>% + mutate(rst_rotation = rst_rotation(tile)) %>% + mutate(rst_scalex = rst_scalex(tile)) %>% + mutate(rst_scaley = rst_scaley(tile)) %>% + mutate(rst_srid = rst_srid(tile)) %>% + mutate(rst_summary = rst_summary(tile)) %>% + mutate(rst_upperleftx = rst_upperleftx(tile)) %>% + mutate(rst_upperlefty = rst_upperlefty(tile)) %>% + mutate(rst_width = rst_width(tile)) %>% + mutate(rst_worldtorastercoordx = rst_worldtorastercoordx(tile, as.double(0.0), as.double(0.0))) %>% + mutate(rst_worldtorastercoordy = rst_worldtorastercoordy(tile, as.double(0.0), as.double(0.0))) %>% + mutate(rst_worldtorastercoord = rst_worldtorastercoord(tile, 
as.double(0.0), as.double(0.0))) + + expect_no_error(spark_write_source(sdf, "noop", mode = "overwrite")) +}) + +test_that("raster flatmap functions behave as intended", { + retiled_sdf <- generate_singleband_raster_df() %>% + mutate(rst_retile = rst_retile(tile, 1200L, 1200L)) + + expect_no_error(spark_write_source(retiled_sdf, "noop", mode = "overwrite")) + expect_equal(sdf_nrow(retiled_sdf), 4) + + subdivide_sdf <- generate_singleband_raster_df() %>% + mutate(rst_subdivide = rst_subdivide(tile, 1L)) + + expect_no_error(spark_write_source(subdivide_sdf, "noop", mode = "overwrite")) + expect_equal(sdf_nrow(subdivide_sdf), 4) + + tessellate_sdf <- generate_singleband_raster_df() %>% + mutate(rst_tessellate = rst_tessellate(tile, 3L)) + + expect_no_error(spark_write_source(tessellate_sdf, "noop", mode = "overwrite")) + expect_equal(sdf_nrow(tessellate_sdf), 66) + + overlap_sdf <- generate_singleband_raster_df() %>% + mutate(rst_to_overlapping_tiles = rst_to_overlapping_tiles(tile, 200L, 200L, 10L)) + + expect_no_error(spark_write_source(overlap_sdf, "noop", mode = "overwrite")) + expect_equal(sdf_nrow(overlap_sdf), 87) + +}) + +test_that("raster aggregation functions behave as intended", { + collection_sdf <- generate_singleband_raster_df() %>% + mutate(extent = st_astext(rst_boundingbox(tile))) %>% + mutate(tile = rst_to_overlapping_tiles(tile, 200L, 200L, 10L)) + + merge_sdf <- collection_sdf %>% + group_by(path) %>% + summarise(tile = rst_merge_agg(tile)) %>% + mutate(extent = st_astext(rst_boundingbox(tile))) + + expect_equal(sdf_nrow(merge_sdf), 1) + expect_equal( + collection_sdf %>% head(1) %>% collect %>% .$extent, + merge_sdf %>% head(1) %>% collect %>% .$extent + ) + + combine_avg_sdf <- collection_sdf %>% + group_by(path) %>% + summarise(tile = rst_combineavg_agg(tile)) %>% + mutate(extent = st_astext(rst_boundingbox(tile))) + + expect_equal(sdf_nrow(combine_avg_sdf), 1) + expect_equal( + collection_sdf %>% head(1) %>% collect %>% .$extent, + 
combine_avg_sdf %>% head(1) %>% collect %>% .$extent + ) + +}) + +test_that("the tessellate-join-clip-merge flow works on NetCDF files", { + target_resolution <- 1L + + region_keys <- c("NAME", "STATE", "BOROUGH", "BLOCK", "TRACT") + + census_sdf <- spark_read_source( + sc, + name = "census_raw", + source = "com.databricks.labs.mosaic.datasource.OGRFileFormat", + path = "data/Blocks2020.zip", + options = list( + "vsizip" = "true", + "chunkSize" = "20" + ) + ) %>% + select(region_keys, geom_0, geom_0_srid) %>% + distinct() %>% + mutate(geom_0 = st_simplify(geom_0, as.double(0.001))) %>% + mutate(geom_0 = st_updatesrid(geom_0, geom_0_srid, 4326L)) %>% + mutate(chip = grid_tessellateexplode(geom_0, target_resolution)) %>% + sdf_select(region_keys, chip$is_core, chip$index_id, chip$wkb) + + raster_sdf <- + spark_read_source( + sc, + name = "raster_raw", + source = "gdal", + path = "data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc", + options = list("raster.read.strategy" = "retile_on_read") + ) %>% + mutate(tile = rst_separatebands(tile)) %>% + sdf_register("raster") + + indexed_raster_sdf <- sdf_sql(sc, "SELECT tile, element_at(rst_metadata(tile), 'NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX') as timestep FROM raster") %>% + filter(timestep == 21L) %>% + mutate(tile = rst_setsrid(tile, 4326L)) %>% + mutate(tile = rst_to_overlapping_tiles(tile, 20L, 20L, 10L)) %>% + mutate(tile = rst_tessellate(tile, target_resolution)) + + clipped_sdf <- indexed_raster_sdf %>% + sdf_select(tile, tile.index_id, timestep, .drop_parents = FALSE) %>% + inner_join(census_sdf, by = "index_id") %>% + mutate(tile = rst_clip(tile, wkb)) + + merged_precipitation <- clipped_sdf %>% + group_by(region_keys, timestep) %>% + summarise(tile = rst_merge_agg(tile)) + + expect_equal(sdf_nrow(merged_precipitation), 1) +}) \ No newline at end of file diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R 
b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R index 2b0416ddd..5d2d6ba0c 100644 --- a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R +++ b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R @@ -1,62 +1,72 @@ -source("data.R") +options(warn = -1) test_that("scalar vector functions behave as intended", { - sdf <- sdf_copy_to( + sdf_raw <- sdf_copy_to( sc, data.frame( - wkt = "POLYGON ((0 0, 0 2, 1 2, 1 0, 0 0))", + wkt = "POLYGON ((2 1, 1 2, 2 3, 2 1))", point_wkt = "POINT (1 1)" ) ) - sdf <- mutate(sdf, "st_area" = st_area(wkt)) - sdf <- mutate(sdf, "st_length" = st_length(wkt)) - sdf <- mutate(sdf, "st_perimeter" = st_perimeter(wkt)) - sdf <- mutate(sdf, "st_buffer" = st_buffer(wkt, as.double(1.1))) - sdf <- mutate(sdf, "st_bufferloop" = st_bufferloop(wkt, as.double(1.1), as.double(1.2))) - sdf <- mutate(sdf, "st_convexhull" = st_convexhull(wkt)) - sdf <- mutate(sdf, "st_dump" = st_dump(wkt)) - sdf <- mutate(sdf, "st_translate" = st_translate(wkt, 1L, 1L)) - sdf <- mutate(sdf, "st_scale" = st_scale(wkt, 1L, 1L)) - sdf <- mutate(sdf, "st_rotate" = st_rotate(wkt, 1L)) - sdf <- mutate(sdf, "st_centroid" = st_centroid(wkt)) - sdf <- mutate(sdf, "st_numpoints" = st_numpoints(wkt)) - sdf <- mutate(sdf, "st_haversine" = st_haversine(as.double(0.0), as.double(90.0), as.double(0.0), as.double(0.0))) - sdf <- mutate(sdf, "st_isvalid" = st_isvalid(wkt)) - sdf <- mutate(sdf, "st_hasvalidcoordinates" = st_hasvalidcoordinates(wkt, "EPSG:2192", "bounds")) - sdf <- mutate(sdf, "st_intersects" = st_intersects(wkt, wkt)) - sdf <- mutate(sdf, "st_intersection" = st_intersection(wkt, wkt)) - sdf <- mutate(sdf, "st_envelope" = st_envelope(wkt)) - sdf <- mutate(sdf, "st_simplify" = st_simplify(wkt, as.double(0.001))) - sdf <- mutate(sdf, "st_difference" = st_difference(wkt, wkt)) - sdf <- mutate(sdf, "st_union" = st_union(wkt, wkt)) - sdf <- mutate(sdf, "st_unaryunion" = st_unaryunion(wkt)) - sdf <- mutate(sdf, 
"st_geometrytype" = st_geometrytype(wkt)) - sdf <- mutate(sdf, "st_xmin" = st_xmin(wkt)) - sdf <- mutate(sdf, "st_xmax" = st_xmax(wkt)) - sdf <- mutate(sdf, "st_ymin" = st_ymin(wkt)) - sdf <- mutate(sdf, "st_ymax" = st_ymax(wkt)) - sdf <- mutate(sdf, "st_zmin" = st_zmin(wkt)) - sdf <- mutate(sdf, "st_zmax" = st_zmax(wkt)) - sdf <- mutate(sdf, "flatten_polygons" = flatten_polygons(wkt)) + sdf <- sdf_raw %>% mutate( + st_area = st_area(wkt), + st_length = st_length(wkt), + st_perimeter = st_perimeter(wkt), + st_buffer = st_buffer(wkt, as.double(1.1)), + st_bufferloop = st_bufferloop(wkt, as.double(1.1), as.double(1.2)), + st_convexhull = st_convexhull(wkt), + st_dump = st_dump(wkt), + st_translate = st_translate(wkt, 1L, 1L), + st_scale = st_scale(wkt, 1L, 1L), + st_rotate = st_rotate(wkt, 1L), + st_centroid = st_centroid(wkt), + st_numpoints = st_numpoints(wkt), + st_haversine = st_haversine(as.double(0.0), as.double(90.0), as.double(0.0), as.double(0.0)), + st_isvalid = st_isvalid(wkt), + st_hasvalidcoordinates = st_hasvalidcoordinates(wkt, "EPSG:2192", "bounds"), + st_intersects = st_intersects(wkt, wkt), + st_intersection = st_intersection(wkt, wkt), + st_envelope = st_envelope(wkt), + st_simplify = st_simplify(wkt, as.double(0.001)), + st_difference = st_difference(wkt, wkt), + st_union = st_union(wkt, wkt), + st_unaryunion = st_unaryunion(wkt), + st_geometrytype = st_geometrytype(wkt), + st_xmin = st_xmin(wkt), + st_xmax = st_xmax(wkt), + st_ymin = st_ymin(wkt), + st_ymax = st_ymax(wkt), + st_zmin = st_zmin(wkt), + st_zmax = st_zmax(wkt) + ) + + expect_no_error(spark_write_source(sdf, "noop", mode = "overwrite")) + expect_equal(sdf_nrow(sdf), 1) # SRID functions - sdf <- mutate(sdf, "geom_with_srid" = st_setsrid(st_geomfromwkt(wkt), 4326L)) - sdf <- mutate(sdf, "srid_check" = st_srid(geom_with_srid)) - sdf <- mutate(sdf, "transformed_geom" = st_transform(geom_with_srid, 3857L)) + sdf <- sdf_raw %>% mutate( + geom_with_srid = st_setsrid(st_geomfromwkt(wkt), 
4326L), + srid_check = st_srid(geom_with_srid), + transformed_geom = st_transform(geom_with_srid, 3857L) + ) + + expect_no_error(spark_write_source(sdf, "noop", mode = "overwrite")) + expect_equal(sdf_nrow(sdf), 1) # Grid functions - sdf <- mutate(sdf, "grid_longlatascellid" = grid_longlatascellid(as.double(1L), as.double(1L), 1L)) - sdf <- mutate(sdf, "grid_pointascellid" = grid_pointascellid(point_wkt, 1L)) - sdf <- mutate(sdf, "grid_boundaryaswkb" = grid_boundaryaswkb(grid_longlatascellid)) - sdf <- mutate(sdf, "grid_polyfill" = grid_polyfill(wkt, 1L)) - sdf <- mutate(sdf, "grid_tessellateexplode" = grid_tessellateexplode(wkt, 1L)) - sdf <- mutate(sdf, "grid_tessellateexplode_no_core_chips" = grid_tessellateexplode(wkt, 1L, FALSE)) - sdf <- mutate(sdf, "grid_tessellate" = grid_tessellate(wkt, 1L)) - sdf <- mutate(sdf, "grid_cellarea" = grid_cellarea(grid_longlatascellid)) + sdf <- sdf_raw %>% mutate( + grid_longlatascellid = grid_longlatascellid(as.double(1L), as.double(1L), 1L), + grid_pointascellid = grid_pointascellid(point_wkt, 1L), + grid_boundaryaswkb = grid_boundaryaswkb(grid_longlatascellid), + grid_polyfill = grid_polyfill(wkt, 1L), + grid_tessellateexplode = grid_tessellateexplode(wkt, 1L), + grid_tessellate = grid_tessellate(wkt, 1L), + grid_cellarea = grid_cellarea(grid_longlatascellid) + ) expect_no_error(spark_write_source(sdf, "noop", mode = "overwrite")) expect_equal(sdf_nrow(sdf), 1) @@ -65,6 +75,7 @@ test_that("scalar vector functions behave as intended", { test_that("aggregate vector functions behave as intended", { + inputGJ <- read_file("data/boroughs.geojson") sdf <- sdf_sql(sc, "SELECT id as location_id FROM range(1)") %>% mutate(geometry = st_geomfromgeojson(inputGJ)) expect_equal(sdf_nrow(sdf), 1) @@ -90,8 +101,8 @@ test_that("aggregate vector functions behave as intended", { sdf.intersection <- sdf.l %>% inner_join(sdf.r, by = c("left_index" = "right_index"), keep = TRUE) %>% - dplyr::group_by(left_id, right_id) %>% - dplyr::summarise( 
+ group_by(left_id, right_id) %>% + summarise( agg_intersects = st_intersects_agg(left_index, right_index), agg_intersection = st_intersection_agg(left_index, right_index), left_geom = max(left_geom, 1), diff --git a/R/sparklyr-mosaic/tests.R b/R/sparklyr-mosaic/tests.R index 17bdd882a..eef6d7c8b 100644 --- a/R/sparklyr-mosaic/tests.R +++ b/R/sparklyr-mosaic/tests.R @@ -1,21 +1,19 @@ -library(testthat) - -if(length(getOption("repos")) < 1) { - options(repos = c( - CRAN = "https://cloud.r-project.org" - )) -} +options(warn = -1) -install.packages("sparklyr", repos="") +library(testthat) +library(dplyr) +library(readr) library(sparklyr) +library(sparklyr.nested) spark_home <- Sys.getenv("SPARK_HOME") spark_home_set(spark_home) + install.packages("sparklyrMosaic_0.4.0.tar.gz", repos = NULL) library(sparklyrMosaic) # find the mosaic jar in staging -staging_dir <- "/home/runner/work/mosaic/mosaic/staging/" +staging_dir <- Sys.getenv("MOSAIC_LIB_PATH", "/home/runner/work/mosaic/mosaic/staging/") mosaic_jar <- list.files(staging_dir) mosaic_jar <- mosaic_jar[grep("jar-with-dependencies.jar", mosaic_jar, fixed=T)] mosaic_jar_path <- paste0(staging_dir, mosaic_jar) @@ -26,5 +24,6 @@ config$`sparklyr.jars.default` <- c(mosaic_jar_path) sc <- spark_connect(master="local[*]", config=config) enableMosaic(sc) +enableGDAL(sc) testthat::test_local(path="./sparklyrMosaic") \ No newline at end of file diff --git a/python/mosaic/api/aggregators.py b/python/mosaic/api/aggregators.py index d68275b73..291d7b534 100644 --- a/python/mosaic/api/aggregators.py +++ b/python/mosaic/api/aggregators.py @@ -68,6 +68,29 @@ def st_intersects_agg(leftIndex: ColumnOrName, rightIndex: ColumnOrName) -> Colu ) +def st_intersects_agg(leftIndex: ColumnOrName, rightIndex: ColumnOrName) -> Column: + """ + Tests if any `leftIndex` : `rightIndex` pairs intersect. 
+ + Parameters + ---------- + leftIndex : Column + The index field of the left-hand geometry + rightIndex : Column + The index field of the right-hand geometry + + Returns + ------- + Column (BooleanType) + + """ + return config.mosaic_context.invoke_function( + "st_intersects_agg", + pyspark_to_java_column(leftIndex), + pyspark_to_java_column(rightIndex), + ) + + def st_union_agg(geom: ColumnOrName) -> Column: """ Returns the point set union of the aggregated geometries. @@ -161,7 +184,9 @@ def rst_combineavg_agg(raster_tile: ColumnOrName) -> Column: ) -def rst_derivedband_agg(raster_tile: ColumnOrName, python_func: ColumnOrName, func_name: ColumnOrName) -> Column: +def rst_derivedband_agg( + raster_tile: ColumnOrName, python_func: ColumnOrName, func_name: ColumnOrName +) -> Column: """ Returns the raster tile representing the aggregation of rasters using provided python function. @@ -182,5 +207,8 @@ def rst_derivedband_agg(raster_tile: ColumnOrName, python_func: ColumnOrName, fu """ return config.mosaic_context.invoke_function( - "rst_derivedband_agg", pyspark_to_java_column(raster_tile), pyspark_to_java_column(python_func), pyspark_to_java_column(func_name) + "rst_derivedband_agg", + pyspark_to_java_column(raster_tile), + pyspark_to_java_column(python_func), + pyspark_to_java_column(func_name), ) diff --git a/python/mosaic/api/enable.py b/python/mosaic/api/enable.py index 9cbc52136..5a96d226e 100644 --- a/python/mosaic/api/enable.py +++ b/python/mosaic/api/enable.py @@ -11,8 +11,11 @@ def enable_mosaic( - spark: SparkSession, dbutils = None, log_info: bool = False, - jar_path: str = None, jar_autoattach: bool = True + spark: SparkSession, + dbutils=None, + log_info: bool = False, + jar_path: str = None, + jar_autoattach: bool = True, ) -> None: """ Enable Mosaic functions. @@ -30,20 +33,20 @@ def enable_mosaic( Logging cannot be adjusted with Unity Catalog Shared Access clusters; if you try to do so, will throw a Py4JSecurityException. 
- True will try to setLogLevel to 'info' - - False will not; Default is False + - False will not; Default is False jar_path : str Convenience when you need to change the JAR path for Unity Catalog Volumes with Shared Access clusters - - Default is None; if provided, sets - "spark.databricks.labs.mosaic.jar.path" + - Default is None; if provided, sets + "spark.databricks.labs.mosaic.jar.path" jar_autoattach : bool Convenience when you need to turn off JAR auto-attach for Unity - Catalog Volumes with Shared Access clusters. + Catalog Volumes with Shared Access clusters. - False will not registers the JAR; sets "spark.databricks.labs.mosaic.jar.autoattach" to "false" - True will register the JAR; Default is True - + Returns ------- @@ -62,7 +65,7 @@ def enable_mosaic( Explicitly specify the index system to use for optimized spatial joins. (Optional) """ - # Set spark session, conditionally: + # Set spark session, conditionally: # - set conf for jar autoattach # - set conf for jar path # - set log level to 'info' @@ -73,9 +76,9 @@ def enable_mosaic( spark.conf.set("spark.databricks.labs.mosaic.jar.path", jar_path) print(f"...set 'spark.databricks.labs.mosaic.jar.path' to '{jar_path}'") if log_info: - spark.sparkContext.setLogLevel('info') + spark.sparkContext.setLogLevel("info") config.mosaic_spark = spark - _ = MosaicLibraryHandler(config.mosaic_spark, log_info = log_info) + _ = MosaicLibraryHandler(config.mosaic_spark, log_info=log_info) config.mosaic_context = MosaicContext(config.mosaic_spark) # Register SQL functions diff --git a/python/mosaic/api/functions.py b/python/mosaic/api/functions.py index e2165195b..c9426b31c 100644 --- a/python/mosaic/api/functions.py +++ b/python/mosaic/api/functions.py @@ -156,7 +156,9 @@ def st_convexhull(geom: ColumnOrName) -> Column: ) -def st_concavehull(geom: ColumnOrName, concavity: ColumnOrName, has_holes: Any = False) -> Column: +def st_concavehull( + geom: ColumnOrName, concavity: ColumnOrName, has_holes: Any = False +) -> 
Column: """ Compute the concave hull of a geometry or multi-geometry object. It uses lengthRatio and @@ -191,7 +193,7 @@ def st_concavehull(geom: ColumnOrName, concavity: ColumnOrName, has_holes: Any = "st_concavehull", pyspark_to_java_column(geom), pyspark_to_java_column(concavity), - pyspark_to_java_column(has_holes) + pyspark_to_java_column(has_holes), ) @@ -218,7 +220,7 @@ def st_buffer(geom: ColumnOrName, radius: ColumnOrName) -> Column: def st_bufferloop( - geom: ColumnOrName, inner_radius: ColumnOrName, outer_radius: ColumnOrName + geom: ColumnOrName, inner_radius: ColumnOrName, outer_radius: ColumnOrName ) -> Column: """ Compute the buffered geometry loop (hollow ring) based on geom and provided radius-es. @@ -364,7 +366,7 @@ def st_transform(geom: ColumnOrName, srid: ColumnOrName) -> Column: def st_hasvalidcoordinates( - geom: ColumnOrName, crs: ColumnOrName, which: ColumnOrName + geom: ColumnOrName, crs: ColumnOrName, which: ColumnOrName ) -> Column: """ Checks if all points in geometry are valid with respect to crs bounds. @@ -571,7 +573,7 @@ def st_distance(geom1: ColumnOrName, geom2: ColumnOrName) -> Column: def st_haversine( - lat1: ColumnOrName, lng1: ColumnOrName, lat2: ColumnOrName, lng2: ColumnOrName + lat1: ColumnOrName, lng1: ColumnOrName, lat2: ColumnOrName, lng2: ColumnOrName ) -> Column: """ Compute the haversine distance in kilometers between two latitude/longitude pairs. @@ -723,7 +725,7 @@ def st_unaryunion(geom: ColumnOrName) -> Column: def st_updatesrid( - geom: ColumnOrName, srcSRID: ColumnOrName, destSRID: ColumnOrName + geom: ColumnOrName, srcSRID: ColumnOrName, destSRID: ColumnOrName ) -> Column: """ Updates the SRID of the input geometry `geom` from `srcSRID` to `destSRID`. 
@@ -1008,7 +1010,7 @@ def grid_boundary(index_id: ColumnOrName, format_name: ColumnOrName) -> Column: def grid_longlatascellid( - lon: ColumnOrName, lat: ColumnOrName, resolution: ColumnOrName + lon: ColumnOrName, lat: ColumnOrName, resolution: ColumnOrName ) -> Column: """ Returns the grid's cell ID associated with the input `lng` and `lat` coordinates at a given grid `resolution`. @@ -1076,7 +1078,7 @@ def grid_polyfill(geom: ColumnOrName, resolution: ColumnOrName) -> Column: def grid_tessellate( - geom: ColumnOrName, resolution: ColumnOrName, keep_core_geometries: Any = True + geom: ColumnOrName, resolution: ColumnOrName, keep_core_geometries: Any = True ) -> Column: """ Generates: @@ -1111,7 +1113,7 @@ def grid_tessellate( def grid_tessellateexplode( - geom: ColumnOrName, resolution: ColumnOrName, keep_core_geometries: Any = True + geom: ColumnOrName, resolution: ColumnOrName, keep_core_geometries: Any = True ) -> Column: """ Generates: @@ -1271,7 +1273,7 @@ def grid_cellkloopexplode(cellid: ColumnOrName, k: ColumnOrName) -> Column: def grid_geometrykring( - geom: ColumnOrName, resolution: ColumnOrName, k: ColumnOrName + geom: ColumnOrName, resolution: ColumnOrName, k: ColumnOrName ) -> Column: """ Returns the k-ring of cells around the input geometry. @@ -1296,7 +1298,7 @@ def grid_geometrykring( def grid_geometrykloop( - geom: ColumnOrName, resolution: ColumnOrName, k: ColumnOrName + geom: ColumnOrName, resolution: ColumnOrName, k: ColumnOrName ) -> Column: """ Returns the k loop (hollow ring) of cells around the input geometry. @@ -1321,7 +1323,7 @@ def grid_geometrykloop( def grid_geometrykringexplode( - geom: ColumnOrName, resolution: ColumnOrName, k: ColumnOrName + geom: ColumnOrName, resolution: ColumnOrName, k: ColumnOrName ) -> Column: """ Returns the exploded k-ring of cells around the input geometry. 
@@ -1346,7 +1348,7 @@ def grid_geometrykringexplode( def grid_geometrykloopexplode( - geom: ColumnOrName, resolution: ColumnOrName, k: ColumnOrName + geom: ColumnOrName, resolution: ColumnOrName, k: ColumnOrName ) -> Column: """ Returns the exploded k loop (hollow ring) of cells around the input geometry. @@ -1393,7 +1395,7 @@ def point_index_geom(geom: ColumnOrName, resolution: ColumnOrName) -> Column: def point_index_lonlat( - lon: ColumnOrName, lat: ColumnOrName, resolution: ColumnOrName + lon: ColumnOrName, lat: ColumnOrName, resolution: ColumnOrName ) -> Column: """ [Deprecated] alias for `grid_longlatascellid` @@ -1450,7 +1452,7 @@ def polyfill(geom: ColumnOrName, resolution: ColumnOrName) -> Column: def mosaic_explode( - geom: ColumnOrName, resolution: ColumnOrName, keep_core_geometries: Any = True + geom: ColumnOrName, resolution: ColumnOrName, keep_core_geometries: Any = True ) -> Column: """ [Deprecated] alias for `grid_tessellateexplode` @@ -1485,7 +1487,7 @@ def mosaic_explode( def mosaicfill( - geom: ColumnOrName, resolution: ColumnOrName, keep_core_geometries: Any = True + geom: ColumnOrName, resolution: ColumnOrName, keep_core_geometries: Any = True ) -> Column: """ [Deprecated] alias for `grid_tessellate` diff --git a/python/mosaic/api/fuse.py b/python/mosaic/api/fuse.py index 8f0cb3372..ac1966cd1 100644 --- a/python/mosaic/api/fuse.py +++ b/python/mosaic/api/fuse.py @@ -6,6 +6,7 @@ __all__ = ["SetupMgr", "setup_fuse_install"] + def get_install_mosaic_version() -> str: """ Currently installed version of mosaic. @@ -21,14 +22,19 @@ def get_install_mosaic_version() -> str: pass return None + @dataclass class SetupMgr: + """ + Defaults mirror setup_gdal. 
+ """ + to_fuse_dir: str - script_in_name: str = 'mosaic-gdal-init.sh' - script_out_name: str = 'mosaic-fuse-init.sh' + script_in_name: str = "mosaic-gdal-init.sh" + script_out_name: str = "mosaic-gdal-init.sh" with_mosaic_pip: bool = False with_gdal: bool = True - with_ubuntugis: bool = False + with_ubuntugis: bool = False override_mosaic_version: str = None jar_copy: bool = False jni_so_copy: bool = False @@ -42,188 +48,228 @@ def configure(self) -> bool: """ # - set the mosaic and github versions # will be used in downloading resources - # may be used in pip install + # may be used in pip install mosaic_version = get_install_mosaic_version() - github_version = mosaic_version # <- valid or None - pip_str = '' + github_version = mosaic_version # <- valid or None + if self.override_mosaic_version is not None and set( + self.override_mosaic_version + ) <= set("=0123456789."): + github_version = self.override_mosaic_version.replace("=", "") + github_version = mosaic_version # <- valid or None + pip_str = "" release_version = None if ( - self.override_mosaic_version is not None and - self.override_mosaic_version == 'main' - ): - github_version = 'main' - elif ( - self.override_mosaic_version is not None and - set(self.override_mosaic_version).issubset(set('=0123456789.')) + self.override_mosaic_version is not None + and self.override_mosaic_version == "main" ): - github_version = self.override_mosaic_version.replace('=','') + github_version = "main" + elif self.override_mosaic_version is not None and set( + self.override_mosaic_version + ).issubset(set("=0123456789.")): + github_version = self.override_mosaic_version.replace("=", "") elif mosaic_version is None: - github_version = 'main' + github_version = "main" - GITHUB_CONTENT_URL_BASE = 'https://raw.githubusercontent.com/databrickslabs/mosaic' - GITHUB_CONTENT_TAG_URL = f'{GITHUB_CONTENT_URL_BASE}/v_{github_version}' - if github_version == 'main': - GITHUB_CONTENT_TAG_URL = f'{GITHUB_CONTENT_URL_BASE}/main' + 
GITHUB_CONTENT_URL_BASE = ( + "https://raw.githubusercontent.com/databrickslabs/mosaic" + ) + GITHUB_CONTENT_TAG_URL = f"{GITHUB_CONTENT_URL_BASE}/v_{github_version}" + if github_version == "main": + GITHUB_CONTENT_TAG_URL = f"{GITHUB_CONTENT_URL_BASE}/main" # - generate fuse dir path os.makedirs(self.to_fuse_dir, exist_ok=True) with_script = self.with_mosaic_pip or self.with_gdal - script_out_path = f'{self.to_fuse_dir}/{self.script_out_name}' + script_out_path = f"{self.to_fuse_dir}/{self.script_out_name}" if with_script: # - start with the unconfigured script - script_url = f'{GITHUB_CONTENT_TAG_URL}/scripts/{self.script_in_name}' + script_url = f"{GITHUB_CONTENT_TAG_URL}/scripts/{self.script_in_name}" script = None with requests.Session() as s: script = s.get(script_url, allow_redirects=True).text - + # - tokens used in script - SCRIPT_FUSE_DIR_TOKEN= "FUSE_DIR='__FUSE_DIR__'" # <- ' added - SCRIPT_GITHUB_VERSION_TOKEN = 'GITHUB_VERSION=__GITHUB_VERSION__' - SCRIPT_MOSAIC_PIP_VERSION_TOKEN = "MOSAIC_PIP_VERSION='__MOSAIC_PIP_VERSION__'" # <- ' added - SCRIPT_WITH_MOSAIC_TOKEN = 'WITH_MOSAIC=0' - SCRIPT_WITH_GDAL_TOKEN = 'WITH_GDAL=0' - SCRIPT_WITH_UBUNTUGIS_TOKEN ='WITH_UBUNTUGIS=0' - SCRIPT_WITH_FUSE_SO_TOKEN = 'WITH_FUSE_SO=0' + SCRIPT_FUSE_DIR_TOKEN = "FUSE_DIR='__FUSE_DIR__'" # <- ' added + SCRIPT_GITHUB_VERSION_TOKEN = "GITHUB_VERSION=__GITHUB_VERSION__" + SCRIPT_MOSAIC_PIP_VERSION_TOKEN = ( + "MOSAIC_PIP_VERSION='__MOSAIC_PIP_VERSION__'" # <- ' added + ) + SCRIPT_WITH_MOSAIC_TOKEN = "WITH_MOSAIC=0" + SCRIPT_WITH_GDAL_TOKEN = "WITH_GDAL=0" + SCRIPT_WITH_UBUNTUGIS_TOKEN = "WITH_UBUNTUGIS=0" + SCRIPT_WITH_FUSE_SO_TOKEN = "WITH_FUSE_SO=0" # - set the github version in the script # this will be used to download so files script = script.replace( - SCRIPT_GITHUB_VERSION_TOKEN, SCRIPT_GITHUB_VERSION_TOKEN.replace( - '__GITHUB_VERSION__', github_version) - ) + SCRIPT_GITHUB_VERSION_TOKEN, + SCRIPT_GITHUB_VERSION_TOKEN.replace( + "__GITHUB_VERSION__", 
github_version + ), + ) # - set the fuse dir script = script.replace( - SCRIPT_FUSE_DIR_TOKEN, SCRIPT_FUSE_DIR_TOKEN.replace('__FUSE_DIR__', self.to_fuse_dir) + SCRIPT_FUSE_DIR_TOKEN, + SCRIPT_FUSE_DIR_TOKEN.replace("__FUSE_DIR__", self.to_fuse_dir), ) - + + script = script.replace("apt-add-repository", "apt-add-repository -y") + # - are we configuring for mosaic pip? if self.with_mosaic_pip: script = script.replace( - SCRIPT_WITH_MOSAIC_TOKEN, SCRIPT_WITH_MOSAIC_TOKEN.replace('0','1') + SCRIPT_WITH_MOSAIC_TOKEN, SCRIPT_WITH_MOSAIC_TOKEN.replace("0", "1") ) - + # - are we configuring for gdal? if self.with_gdal: script = script.replace( - SCRIPT_WITH_GDAL_TOKEN, SCRIPT_WITH_GDAL_TOKEN.replace('0','1') + SCRIPT_WITH_GDAL_TOKEN, SCRIPT_WITH_GDAL_TOKEN.replace("0", "1") ) - + # - are we configuring for ubuntugis? if self.with_ubuntugis: script = script.replace( - SCRIPT_WITH_UBUNTUGIS_TOKEN, SCRIPT_WITH_UBUNTUGIS_TOKEN.replace('0','1') + SCRIPT_WITH_UBUNTUGIS_TOKEN, + SCRIPT_WITH_UBUNTUGIS_TOKEN.replace("0", "1"), ) # - are we configuring for jni so copy? 
if self.jni_so_copy: script = script.replace( - SCRIPT_WITH_FUSE_SO_TOKEN, SCRIPT_WITH_FUSE_SO_TOKEN.replace('0','1') + SCRIPT_WITH_FUSE_SO_TOKEN, + SCRIPT_WITH_FUSE_SO_TOKEN.replace("0", "1"), ) - + # - set the mosaic version for pip if ( - self.override_mosaic_version is not None and - not self.override_mosaic_version == 'main' + self.override_mosaic_version is not None + and not self.override_mosaic_version == "main" ): - pip_str = f'=={self.override_mosaic_version}' - if any(c in self.override_mosaic_version for c in ['=','>','<']): + pip_str = f"=={self.override_mosaic_version}" + if any(c in self.override_mosaic_version for c in ["=", ">", "<"]): pip_str = f"""{self.override_mosaic_version.replace("'","").replace('"','')}""" else: pip_str = f"=={self.override_mosaic_version}" elif mosaic_version is not None: pip_str = f"=={mosaic_version}" script = script.replace( - SCRIPT_MOSAIC_PIP_VERSION_TOKEN, + SCRIPT_MOSAIC_PIP_VERSION_TOKEN, SCRIPT_MOSAIC_PIP_VERSION_TOKEN.replace( - "__MOSAIC_PIP_VERSION__", pip_str) + "__MOSAIC_PIP_VERSION__", pip_str + ), ) - + # - write the configured init script - with open(script_out_path, 'w') as file: + with open(script_out_path, "w") as file: file.write(script) - + # --- end of script config --- with_resources = self.jar_copy or self.jni_so_copy resource_statuses = {} - if with_resources: - CHUNK_SIZE = 1024 * 1024 * 64 # 64MB + if with_resources: + CHUNK_SIZE = 1024 * 1024 * 64 # 64MB # - handle jar copy if self.jar_copy: # url and version details - GITHUB_RELEASE_URL_BASE = 'https://github.com/databrickslabs/mosaic/releases' + GITHUB_RELEASE_URL_BASE = ( + "https://github.com/databrickslabs/mosaic/releases" + ) resource_version = github_version - if github_version == 'main': + if github_version == "main": latest = None with requests.Session() as s: - latest = str(s.get(f'{GITHUB_RELEASE_URL_BASE}/latest', allow_redirects=True).content) + latest = str( + s.get( + f"{GITHUB_RELEASE_URL_BASE}/latest", + allow_redirects=True, + 
).content + ) resource_version = latest.split("/tag/v_")[1].split('"')[0] - # download jar - jar_filename = f'mosaic-{resource_version}-jar-with-dependencies.jar' - jar_path = f'{self.to_fuse_dir}/{jar_filename}' + # download jar + jar_filename = f"mosaic-{resource_version}-jar-with-dependencies.jar" + jar_path = f"{self.to_fuse_dir}/{jar_filename}" with requests.Session() as s: r = s.get( - f'{GITHUB_RELEASE_URL_BASE}/download/v_{resource_version}/{jar_filename}', - stream=True - ) - with open(jar_path, 'wb') as f: - for ch in r.iter_content(chunk_size=CHUNK_SIZE): + f"{GITHUB_RELEASE_URL_BASE}/download/v_{resource_version}/{jar_filename}", + stream=True, + ) + with open(jar_path, "wb") as f: + for ch in r.iter_content(chunk_size=CHUNK_SIZE): f.write(ch) resource_statuses[jar_filename] = r.status_code - # - handle so copy + # - handle so copy if self.jni_so_copy: with requests.Session() as s: - for so_filename in ['libgdalalljni.so', 'libgdalalljni.so.30', 'libgdalalljni.so.30.0.3']: - so_path = f'{self.to_fuse_dir}/{so_filename}' + for so_filename in [ + "libgdalalljni.so", + "libgdalalljni.so.30", + "libgdalalljni.so.30.0.3", + ]: + so_path = f"{self.to_fuse_dir}/{so_filename}" r = s.get( - f'{GITHUB_CONTENT_TAG_URL}/resources/gdal/jammy/{so_filename}', - stream=True + f"{GITHUB_CONTENT_TAG_URL}/resources/gdal/jammy/{so_filename}", + stream=True, ) - with open(so_path, 'wb') as f: - for ch in r.iter_content(chunk_size=CHUNK_SIZE): + with open(so_path, "wb") as f: + for ch in r.iter_content(chunk_size=CHUNK_SIZE): f.write(ch) resource_statuses[so_filename] = r.status_code - + # - echo status print(f"::: Install setup complete :::") - print(f"- Settings: 'with_mosaic_pip'? {self.with_mosaic_pip}, 'with_gdal'? {self.with_gdal}, 'with_ubuntugis'? {self.with_ubuntugis}") - print(f" 'jar_copy'? {self.jar_copy}, 'jni_so_copy'? {self.jni_so_copy}, 'override_mosaic_version'? {self.override_mosaic_version}") - print(f"- Derived: 'mosaic_version'? 
{mosaic_version}, 'github_version'? {github_version}, 'release_version'? {release_version}, 'pip_str'? {pip_str}") + print( + f"- Settings: 'with_mosaic_pip'? {self.with_mosaic_pip}, 'with_gdal'? {self.with_gdal}, 'with_ubuntugis'? {self.with_ubuntugis}" + ) + print( + f" 'override_mosaic_version'? {self.override_mosaic_version}, 'jar_copy'? {self.jar_copy}, 'jni_so_copy'? {self.jni_so_copy}" + ) print(f"- Fuse Dir: '{self.to_fuse_dir}'") if with_script: - print(f"- Init Script: configured and stored at '{self.script_out_name}'; ", end='') - print(f"add to your cluster and restart,") - print(f" more at https://docs.databricks.com/en/init-scripts/cluster-scoped.html") + print( + f"- Init Script: configured and stored at '{self.script_out_name}'; ", + end="", + ) + print(f"add to your cluster and restart,") + print( + f" more at https://docs.databricks.com/en/init-scripts/cluster-scoped.html" + ) if with_resources: - print(f"- Resource(s): copied") - print(resource_statuses) + print(f"- Resource(s): copied") + print(resource_statuses) print("\n") - if ( - not any(resource_statuses) or - all(value == 200 for value in resource_statuses.values()) + if not any(resource_statuses) or all( + value == 200 for value in resource_statuses.values() ): return True else: return False - + + def setup_fuse_install( - to_fuse_dir: str, with_mosaic_pip: bool, with_gdal: bool, - with_ubuntugis: bool = False, script_out_name: str = 'mosaic-fuse-init.sh', - override_mosaic_version: str = None, jar_copy: bool = True, jni_so_copy: bool = True -) -> bool: + to_fuse_dir: str, + with_mosaic_pip: bool, + with_gdal: bool, + with_ubuntugis: bool = False, + script_out_name: str = "mosaic-fuse-init.sh", + override_mosaic_version: str = None, + jar_copy: bool = True, + jni_so_copy: bool = True, +) -> None: """ [1] Copies Mosaic "fat" JAR (with dependencies) into `to_fuse_dir` - by default, version will match the current mosaic version executing the command, - assuming it is a released version; if 
`override_mosaic_version` is a single value, + assuming it is a released version; if `override_mosaic_version` is a single value, versus a range, that value will be used instead - this doesn't involve a script unless `with_mosaic_pip=True` or `with_gdal=True` - if `jar_copy=False`, then the JAR is not copied [2] if `with_mosaic_pip=True` - - By default, configures script to pip install databricks-mosaic using current mosaic + - By default, configures script to pip install databricks-mosaic using current mosaic version executing the command or to `override_mosaic_version` - this is useful (1) to "pin" to a specific mosaic version, especially if using the JAR that is also being pre-staged for this version and (2) to consolidate all mosaic @@ -231,22 +277,22 @@ def setup_fuse_install( [3] if `with_gdal=True` - configures script that is a variation of what setup_gdal does with some differences - configures to load shared objects from fuse dir (vs wget) - [4] if `with_ubuntugis=True` (assumes `with_gdal=True`) + [4] if `with_ubuntugis=True` (assumes `with_gdal=True`) - configures script to use the GDAL version provided by ubuntugis - default is False Notes: (a) `to_fuse_dir` can be one of `/Volumes/..`, `/Workspace/..`, `/dbfs/..` - (b) Volume paths are the recommended FUSE mount for Databricks in DBR 13.3+ + (b) Volume paths are the recommended FUSE mount for Databricks in DBR 13.3+ (c) If using Volumes, there are more admin actions that a Unity Catalog admin - needs to be take to add the generated script and JAR to the Unity Catalog - allowlist, essential steps for Shared Cluster and Java access! + needs to be take to add the generated script and JAR to the Unity Catalog + allowlist, essential steps for Shared Cluster and Java access! Parameters ---------- to_fuse_dir : str Path to write out the resource(s) for GDAL installation. 
with_mosaic_pip : bool - Whether to configure a script that pip installs databricks-mosaic, + Whether to configure a script that pip installs databricks-mosaic, fixed to the current version. with_gdal : bool Whether to also configure a script for GDAL and pre-stage GDAL JNI shared object files. @@ -271,12 +317,12 @@ def setup_fuse_install( """ setup_mgr = SetupMgr( to_fuse_dir, - with_mosaic_pip = with_mosaic_pip, - with_gdal = with_gdal, - with_ubuntugis = with_ubuntugis, - script_out_name = script_out_name, - override_mosaic_version = override_mosaic_version, - jar_copy = jar_copy, - jni_so_copy = jni_so_copy + with_mosaic_pip=with_mosaic_pip, + with_gdal=with_gdal, + with_ubuntugis=with_ubuntugis, + script_out_name=script_out_name, + override_mosaic_version=override_mosaic_version, + jar_copy=jar_copy, + jni_so_copy=jni_so_copy, ) return setup_mgr.configure() diff --git a/python/mosaic/api/gdal.py b/python/mosaic/api/gdal.py index 9a44c446b..386070400 100644 --- a/python/mosaic/api/gdal.py +++ b/python/mosaic/api/gdal.py @@ -5,32 +5,34 @@ __all__ = ["setup_gdal", "enable_gdal"] - + def setup_gdal( - to_fuse_dir: str = '/Workspace/Shared/geospatial/mosaic/gdal/jammy', - with_mosaic_pip: bool = False, with_ubuntugis: bool = False, - script_out_name: str = 'mosaic-gdal-init.sh', - override_mosaic_version: str = None + to_fuse_dir: str = "/Workspace/Shared/geospatial/mosaic/gdal/jammy", + with_mosaic_pip: bool = False, + with_ubuntugis: bool = False, + script_out_name: str = "mosaic-gdal-init.sh", + override_mosaic_version: str = None, + jni_so_copy: bool = False, ) -> bool: """ Prepare GDAL init script and shared objects required for GDAL to run on spark. This function will generate the init script that will install GDAL on each worker node. After the setup_gdal is run, the init script must be added to the cluster; also, - a cluster restart is required. - + a cluster restart is required. 
+ Notes: (a) This is close in behavior to Mosaic < 0.4 series (prior to DBR 13), with new options to pip install Mosaic for either ubuntugis gdal (3.4.3) or jammy default (3.4.1) (b) `to_fuse_dir` can be one of `/Volumes/..`, `/Workspace/..`, `/dbfs/..`; however, you should use `setup_fuse_install()` for Volume based installs - + Parameters ---------- to_fuse_dir : str Path to write out the init script for GDAL installation; default is '/Workspace/Shared/geospatial/mosaic/gdal/jammy'. with_mosaic_pip : bool - Whether to configure a script that pip installs databricks-mosaic, + Whether to configure a script that pip installs databricks-mosaic, fixed to the current version; default is False. with_ubuntugis : bool Whether to use ubuntugis ppa for GDAL instead of built-in; @@ -45,20 +47,21 @@ def setup_gdal( Returns True unless resources fail to download. ------- - """ + """ setup_mgr = SetupMgr( to_fuse_dir, - with_mosaic_pip = with_mosaic_pip, - with_ubuntugis = with_ubuntugis, - script_out_name = script_out_name, - override_mosaic_version = override_mosaic_version, + with_mosaic_pip=with_mosaic_pip, + with_ubuntugis=with_ubuntugis, + script_out_name=script_out_name, + override_mosaic_version=override_mosaic_version, + jni_so_copy=jni_so_copy, ) return setup_mgr.configure() def enable_gdal(spark: SparkSession) -> None: """ - Enable GDAL at runtime on a cluster with GDAL installed using init script, + Enable GDAL at runtime on a cluster with GDAL installed using init script, e.g. generated by setup_gdal() or setup_fuse_install() call. 
Parameters diff --git a/python/mosaic/api/raster.py b/python/mosaic/api/raster.py index befd10cb9..3638510dc 100644 --- a/python/mosaic/api/raster.py +++ b/python/mosaic/api/raster.py @@ -45,6 +45,8 @@ "rst_rotation", "rst_scalex", "rst_scaley", + "rst_separatebands", + "rst_setsrid", "rst_setnodata", "rst_skewx", "rst_skewy", @@ -82,7 +84,9 @@ def rst_bandmetadata(raster_tile: ColumnOrName, band: ColumnOrName) -> Column: """ return config.mosaic_context.invoke_function( - "rst_bandmetadata", pyspark_to_java_column(raster_tile), pyspark_to_java_column(band) + "rst_bandmetadata", + pyspark_to_java_column(raster_tile), + pyspark_to_java_column(band), ) @@ -126,7 +130,9 @@ def rst_clip(raster_tile: ColumnOrName, geometry: ColumnOrName) -> Column: """ return config.mosaic_context.invoke_function( - "rst_clip", pyspark_to_java_column(raster_tile), pyspark_to_java_column(geometry) + "rst_clip", + pyspark_to_java_column(raster_tile), + pyspark_to_java_column(geometry), ) @@ -150,7 +156,9 @@ def rst_combineavg(raster_tiles: ColumnOrName) -> Column: ) -def rst_derivedband(raster_tile: ColumnOrName, python_func: ColumnOrName, func_name: ColumnOrName) -> Column: +def rst_derivedband( + raster_tile: ColumnOrName, python_func: ColumnOrName, func_name: ColumnOrName +) -> Column: """ Creates a new band by applying the given python function to the input rasters. The result is a raster tile. @@ -425,7 +433,9 @@ def rst_numbands(raster_tile: ColumnOrName) -> Column: ) -def rst_ndvi(raster_tile: ColumnOrName, band1: ColumnOrName, band2: ColumnOrName) -> Column: +def rst_ndvi( + raster_tile: ColumnOrName, band1: ColumnOrName, band2: ColumnOrName +) -> Column: """ Computes the NDVI of the raster. The result is Mosaic raster tile struct of the NDVI raster. 
@@ -515,7 +525,9 @@ def rst_rastertogridavg(raster_tile: ColumnOrName, resolution: ColumnOrName) -> ) -def rst_rastertogridcount(raster_tile: ColumnOrName, resolution: ColumnOrName) -> Column: +def rst_rastertogridcount( + raster_tile: ColumnOrName, resolution: ColumnOrName +) -> Column: """ The result is a 2D array of cells, where each cell is a struct of (cellID, value). For getting the output of cellID->value pairs, please use explode() function twice. @@ -565,7 +577,9 @@ def rst_rastertogridmax(raster_tile: ColumnOrName, resolution: ColumnOrName) -> ) -def rst_rastertogridmedian(raster_tile: ColumnOrName, resolution: ColumnOrName) -> Column: +def rst_rastertogridmedian( + raster_tile: ColumnOrName, resolution: ColumnOrName +) -> Column: """ The result is a 2D array of cells, where each cell is a struct of (cellID, value). For getting the output of cellID->value pairs, please use explode() function twice. @@ -784,6 +798,28 @@ def rst_scaley(raster_tile: ColumnOrName) -> Column: ) +def rst_separatebands(raster_tile: ColumnOrName) -> Column: + """ + Returns a set of new single-band rasters, one for each band in the input raster. + Result set is automatically exploded. + + Parameters + ---------- + raster_tile : Column (RasterTileType) + Mosaic raster tile struct column. + + Returns + ------- + Column (MosaicTile) + The single-band raster tiles, exploded. + + """ + return config.mosaic_context.invoke_function( + "rst_separatebands", + pyspark_to_java_column(raster_tile), + ) + + def rst_setnodata(raster_tile: ColumnOrName, nodata: ColumnOrName) -> Column: """ Sets the nodata value of the band. @@ -848,6 +884,28 @@ def rst_skewy(raster_tile: ColumnOrName) -> Column: ) +def rst_setsrid(raster_tile: ColumnOrName, srid: ColumnOrName) -> Column: + """ + Sets the SRID of the raster. + The SRID is the EPSG code of the raster. + + Parameters + ---------- + raster_tile : Column (RasterTileType) + Mosaic raster tile struct column. 
+ srid : Column (IntegerType) + EPSG authority code for the file's projection. + Returns + ------- + Column (MosaicRasterTile) + The updated raster. + + """ + return config.mosaic_context.invoke_function( + "rst_setsrid", pyspark_to_java_column(raster_tile), pyspark_to_java_column(srid) + ) + + def rst_srid(raster_tile: ColumnOrName) -> Column: """ Computes the SRID of the raster. @@ -939,7 +997,9 @@ def rst_tessellate(raster_tile: ColumnOrName, resolution: ColumnOrName) -> Colum ) -def rst_fromcontent(raster_bin: ColumnOrName, driver: ColumnOrName, size_in_mb: Any = -1) -> Column: +def rst_fromcontent( + raster_bin: ColumnOrName, driver: ColumnOrName, size_in_mb: Any = -1 +) -> Column: """ Tiles the raster binary into tiles of the given size. :param raster_bin: @@ -948,13 +1008,13 @@ def rst_fromcontent(raster_bin: ColumnOrName, driver: ColumnOrName, size_in_mb: :return: """ if type(size_in_mb) == int: - size_in_mb = lit(size_in_mb) + size_in_mb = lit(size_in_mb) return config.mosaic_context.invoke_function( "rst_fromcontent", pyspark_to_java_column(raster_bin), pyspark_to_java_column(driver), - pyspark_to_java_column(size_in_mb) + pyspark_to_java_column(size_in_mb), ) @@ -966,10 +1026,12 @@ def rst_fromfile(raster_path: ColumnOrName, size_in_mb: Any = -1) -> Column: :return: """ if type(size_in_mb) == int: - size_in_mb = lit(size_in_mb) - + size_in_mb = lit(size_in_mb) + return config.mosaic_context.invoke_function( - "rst_fromfile", pyspark_to_java_column(raster_path), pyspark_to_java_column(size_in_mb) + "rst_fromfile", + pyspark_to_java_column(raster_path), + pyspark_to_java_column(size_in_mb), ) diff --git a/python/mosaic/core/__init__.py b/python/mosaic/core/__init__.py index 683fb00cd..7d90bfb91 100644 --- a/python/mosaic/core/__init__.py +++ b/python/mosaic/core/__init__.py @@ -1,2 +1,2 @@ from .library_handler import MosaicLibraryHandler -from .mosaic_context import MosaicContext \ No newline at end of file +from .mosaic_context import MosaicContext diff 
--git a/python/mosaic/core/library_handler.py b/python/mosaic/core/library_handler.py index 6568fd042..8b4d81ddf 100644 --- a/python/mosaic/core/library_handler.py +++ b/python/mosaic/core/library_handler.py @@ -11,7 +11,7 @@ class MosaicLibraryHandler: _jar_path = None _jar_filename = None _auto_attached_enabled = None - + def __init__(self, spark, log_info: bool = True): self.spark = spark self.sc = spark.sparkContext @@ -21,7 +21,7 @@ def __init__(self, spark, log_info: bool = True): LOGGER = log4jLogger.LogManager.getLogger(__class__.__name__) if self.auto_attach_enabled: - jar_path = self.mosaic_library_location + jar_path = self.mosaic_library_location LOGGER and LOGGER.info(f"Looking for Mosaic JAR at {jar_path}.") if not os.path.exists(jar_path): raise FileNotFoundError( @@ -36,7 +36,7 @@ def auto_attach_enabled(self) -> bool: try: result = ( self.spark.conf.get("spark.databricks.labs.mosaic.jar.autoattach") - == 'true' + == "true" ) except Py4JJavaError as e: result = True @@ -99,7 +99,7 @@ def auto_attach(self): lib = JavaJarId( JarURI, ManagedLibraryId.defaultOrganization(), - NoVersionModule.simpleString() + NoVersionModule.simpleString(), ) libSeq = converters.asScalaBufferConverter((lib,)).asScala().toSeq() diff --git a/python/mosaic/models/knn/spatial_knn.py b/python/mosaic/models/knn/spatial_knn.py index e5b10a394..c1625841f 100644 --- a/python/mosaic/models/knn/spatial_knn.py +++ b/python/mosaic/models/knn/spatial_knn.py @@ -3,7 +3,6 @@ class SpatialKNN: - """ SpatialKNN is a distributed KNN model that uses a spatial index to reduce the number of candidate records to consider for each query record. 
The model is built on top of the Spark DataFrame API and is designed to be diff --git a/python/mosaic/utils/display_handler.py b/python/mosaic/utils/display_handler.py index d7b8d8f65..40d27967c 100644 --- a/python/mosaic/utils/display_handler.py +++ b/python/mosaic/utils/display_handler.py @@ -28,7 +28,7 @@ def __init__(self, spark: SparkSession): def display_dataframe(self, df: DataFrame): prettifier = self.PrettifierModule - pretty_jdf = (prettifier.prettified(df._jdf, self.ScalaOptionObject.apply(None))) + pretty_jdf = prettifier.prettified(df._jdf, self.ScalaOptionObject.apply(None)) pretty_df = DataFrame(pretty_jdf, config.sql_context) self.dataframe_display_function(pretty_df) diff --git a/python/mosaic/utils/kepler_magic.py b/python/mosaic/utils/kepler_magic.py index 66dd418fb..0d38d632f 100644 --- a/python/mosaic/utils/kepler_magic.py +++ b/python/mosaic/utils/kepler_magic.py @@ -1,6 +1,5 @@ import re -import h3 import pandas as pd from IPython.core.magic import Magics, cell_magic, magics_class from keplergl import KeplerGl @@ -24,7 +23,6 @@ @magics_class class MosaicKepler(Magics): - """ A magic command for visualizing data in KeplerGl. 
""" diff --git a/python/test/data/Blocks2020.zip b/python/test/data/Blocks2020.zip new file mode 100644 index 000000000..7c367746e Binary files /dev/null and b/python/test/data/Blocks2020.zip differ diff --git a/python/test/data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc b/python/test/data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc new file mode 100644 index 000000000..33a03c5af Binary files /dev/null and b/python/test/data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc differ diff --git a/python/test/test_display_handler.py b/python/test/test_display_handler.py index ed1661f12..b9441624f 100644 --- a/python/test/test_display_handler.py +++ b/python/test/test_display_handler.py @@ -6,7 +6,7 @@ class TestDisplayHandler(MosaicTestCase): def setUp(self) -> None: return super().setUp() - + def test_display(self): df = self.wkt_boroughs() poly_df = df.select(st_makepolygon(st_geomfromwkt("wkt")).alias("polygon_geom")) diff --git a/python/test/test_fuse_install.py b/python/test/test_fuse_install.py index 7c2208b18..90a3b9c1b 100644 --- a/python/test/test_fuse_install.py +++ b/python/test/test_fuse_install.py @@ -11,8 +11,8 @@ def test_setup_no_op(self): self.assertTrue(installer.do_op()) except Exception: self.fail("Executing `setup_fuse_install()` raised an exception.") - - self.assertEqual(len(installer.list_files()), 0) # <- nothing generated + + self.assertEqual(len(installer.list_files()), 0) # <- nothing generated def test_setup_jar_only(self): installer = FuseInstaller(False, False, jar_copy=True, jni_so_copy=False) @@ -20,7 +20,7 @@ def test_setup_jar_only(self): self.assertTrue(installer.do_op()) except Exception: self.fail("Executing `setup_fuse_install()` raised an exception.") - + def test_setup_sh_pip_only(self): installer = FuseInstaller(True, False, jar_copy=False, jni_so_copy=False) try: @@ -28,7 +28,7 
@@ def test_setup_sh_pip_only(self): except Exception: self.fail("Executing `setup_fuse_install()` raised an exception.") - self.assertEqual(len(installer.list_files()), 1) # <- just init script + self.assertEqual(len(installer.list_files()), 1) # <- just init script def test_setup_sh_gdal(self): installer = FuseInstaller(False, True, jar_copy=False, jni_so_copy=False) @@ -36,8 +36,8 @@ def test_setup_sh_gdal(self): self.assertTrue(installer.do_op()) except Exception: self.fail("Executing `setup_fuse_install()` raised an exception.") - - self.assertEqual(len(installer.list_files()), 1) # <- just init script + + self.assertEqual(len(installer.list_files()), 1) # <- just init script def test_setup_sh_gdal_jni(self): installer = FuseInstaller(False, True, jar_copy=False, jni_so_copy=True) diff --git a/python/test/test_gdal_install.py b/python/test/test_gdal_install.py index 340b96841..f98a3d8b2 100644 --- a/python/test/test_gdal_install.py +++ b/python/test/test_gdal_install.py @@ -11,8 +11,10 @@ def test_setup_gdal(self): self.assertTrue(installer.do_op()) except Exception: self.fail("Copying objects with `setup_gdal()` raised an exception.") - - self.assertEqual(len(installer.list_files()), 1) # <- just init script + + self.assertEqual( + len(installer.list_files()), 4 + ) # <- init script and shared objs try: installer_result = installer.run_init_script() diff --git a/python/test/test_raster_functions.py b/python/test/test_raster_functions.py index f8e09b6f5..be5fd4656 100644 --- a/python/test/test_raster_functions.py +++ b/python/test/test_raster_functions.py @@ -1,13 +1,13 @@ -from pyspark.sql.functions import abs, col, first, lit, sqrt, array +from pyspark.sql.functions import abs, col, first, lit, sqrt, array, element_at -from .context import api +from .context import api, readers from .utils import MosaicTestCaseWithGDAL class TestRasterFunctions(MosaicTestCaseWithGDAL): def setUp(self) -> None: return super().setUp() - + def test_read_raster(self): result = 
self.generate_singleband_raster_df().first() self.assertEqual(result.length, 1067862) @@ -115,11 +115,15 @@ def test_raster_flatmap_functions(self): ) tessellate_result.write.format("noop").mode("overwrite").save() - self.assertEqual(tessellate_result.count(), 55) + self.assertEqual(tessellate_result.count(), 66) - overlap_result = self.generate_singleband_raster_df().withColumn( - "rst_to_overlapping_tiles", - api.rst_to_overlapping_tiles("tile", lit(200), lit(200), lit(10)), + overlap_result = ( + self.generate_singleband_raster_df() + .withColumn( + "rst_to_overlapping_tiles", + api.rst_to_overlapping_tiles("tile", lit(200), lit(200), lit(10)), + ) + .withColumn("rst_subdatasets", api.rst_subdatasets("tile")) ) overlap_result.write.format("noop").mode("overwrite").save() @@ -142,7 +146,9 @@ def test_raster_aggregator_functions(self): ) self.assertEqual(merge_result.count(), 1) - self.assertEqual(collection.first()["extent"], merge_result.first()["extent"]) + self.assertEqual( + collection.select("extent").first(), merge_result.select("extent").first() + ) combine_avg_result = ( collection.groupBy("path") @@ -152,5 +158,70 @@ def test_raster_aggregator_functions(self): self.assertEqual(combine_avg_result.count(), 1) self.assertEqual( - collection.first()["extent"], combine_avg_result.first()["extent"] + collection.select("extent").first(), + combine_avg_result.select("extent").first(), + ) + + def test_netcdf_load_tessellate_clip_merge(self): + target_resolution = 1 + + region_keys = ["NAME", "STATE", "BOROUGH", "BLOCK", "TRACT"] + + census_df = ( + readers.read() + .format("multi_read_ogr") + .option("vsizip", "true") + .option("chunkSize", "20") + .load("test/data/Blocks2020.zip") + .select(*region_keys, "geom_0", "geom_0_srid") + .dropDuplicates() + .withColumn("geom_0", api.st_simplify("geom_0", lit(0.001))) + .withColumn( + "geom_0", api.st_updatesrid("geom_0", col("geom_0_srid"), lit(4326)) + ) + .withColumn( + "chip", api.grid_tessellateexplode("geom_0", 
lit(target_resolution)) + ) + .select(*region_keys, "chip.*") + ) + + df = ( + self.spark.read.format("gdal") + .option("raster.read.strategy", "retile_on_read") + .load( + "test/data/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc" + ) + .select(api.rst_separatebands("tile").alias("tile")) + .withColumn( + "timestep", + element_at( + api.rst_metadata("tile"), "NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX" + ), + ) + .withColumn("tile", api.rst_setsrid("tile", lit(4326))) + .where(col("timestep") == 21) + .withColumn( + "tile", api.rst_to_overlapping_tiles("tile", lit(20), lit(20), lit(10)) + ) + .repartition(self.spark.sparkContext.defaultParallelism) ) + + prh_bands_indexed = df.withColumn( + "tile", api.rst_tessellate("tile", lit(target_resolution)) + ) + + clipped_precipitation = ( + prh_bands_indexed.alias("var") + .join( + census_df.alias("aoi"), + how="inner", + on=col("var.tile.index_id") == col("aoi.index_id"), + ) + .withColumn("tile", api.rst_clip("var.tile", "aoi.wkb")) + ) + + merged_precipitation = clipped_precipitation.groupBy(*region_keys).agg( + api.rst_merge_agg("tile").alias("tile") + ) + + self.assertEqual(merged_precipitation.count(), 1) diff --git a/python/test/test_vector_functions.py b/python/test/test_vector_functions.py index 8dbb191a9..afca19778 100644 --- a/python/test/test_vector_functions.py +++ b/python/test/test_vector_functions.py @@ -9,7 +9,7 @@ class TestVectorFunctions(MosaicTestCase): def setUp(self) -> None: return super().setUp() - + def test_st_point(self): expected = [ "POINT (0 0)", @@ -188,12 +188,12 @@ def test_aggregation_functions(self): .join(right_df, col("left_index.index_id") == col("right_index.index_id")) .groupBy("left_id", "right_id") .agg( - api.st_intersects_agg( - col("left_index"), col("right_index") - ).alias("agg_intersects"), - api.st_intersection_agg( - col("left_index"), col("right_index") - ).alias("agg_intersection"), + api.st_intersects_agg(col("left_index"), 
col("right_index")).alias( + "agg_intersects" + ), + api.st_intersection_agg(col("left_index"), col("right_index")).alias( + "agg_intersection" + ), first("left_geom").alias("left_geom"), first("right_geom").alias("right_geom"), ) diff --git a/python/test/utils/mosaic_test_case.py b/python/test/utils/mosaic_test_case.py index 986f9dc79..30437c265 100644 --- a/python/test/utils/mosaic_test_case.py +++ b/python/test/utils/mosaic_test_case.py @@ -16,7 +16,7 @@ def setUp(self) -> None: def setUpClass(cls) -> None: super().setUpClass() api.enable_mosaic(cls.spark) - + def setUp(self) -> None: return super().setUp() diff --git a/python/test/utils/mosaic_test_case_with_gdal.py b/python/test/utils/mosaic_test_case_with_gdal.py index cbcf1aa13..046e71674 100644 --- a/python/test/utils/mosaic_test_case_with_gdal.py +++ b/python/test/utils/mosaic_test_case_with_gdal.py @@ -8,7 +8,7 @@ class MosaicTestCaseWithGDAL(MosaicTestCase): def setUp(self) -> None: return super().setUp() - + @classmethod def setUpClass(cls) -> None: super().setUpClass() diff --git a/python/test/utils/setup_fuse.py b/python/test/utils/setup_fuse.py index a10a4db66..5bdc2b472 100644 --- a/python/test/utils/setup_fuse.py +++ b/python/test/utils/setup_fuse.py @@ -6,11 +6,9 @@ import subprocess import tempfile + class FuseInstaller: - def __init__( - self, with_mosaic_pip, with_gdal, - jar_copy = False, jni_so_copy = False - ): + def __init__(self, with_mosaic_pip, with_gdal, jar_copy=False, jni_so_copy=False): self._site_packages = working_set.find(Requirement("keplergl")).location self._temp_dir = tempfile.mkdtemp() self.with_mosaic_pip = with_mosaic_pip @@ -28,9 +26,9 @@ def do_op(self) -> bool: self.with_mosaic_pip, self.with_gdal, jar_copy=self.jar_copy, - jni_so_copy=self.jni_so_copy, + jni_so_copy=self.jni_so_copy, override_mosaic_version="main", - script_out_name=self.FUSE_INIT_SCRIPT_FILENAME + script_out_name=self.FUSE_INIT_SCRIPT_FILENAME, ) def run_init_script(self) -> int: @@ -45,5 +43,5 @@ def 
run_init_script(self) -> int: ) return result.returncode - def list_files(self) ->list[str]: + def list_files(self) -> list[str]: return os.listdir(self._temp_dir) diff --git a/python/test/utils/setup_gdal.py b/python/test/utils/setup_gdal.py index 097ee77e2..a62c4207e 100644 --- a/python/test/utils/setup_gdal.py +++ b/python/test/utils/setup_gdal.py @@ -1,10 +1,11 @@ -from pkg_resources import working_set, Requirement -from test.context import api - import os import shutil import subprocess import tempfile +from pkg_resources import working_set, Requirement + +from test.context import api + class GDALInstaller: def __init__(self): @@ -17,9 +18,10 @@ def __del__(self): def do_op(self) -> bool: return api.setup_gdal( - to_fuse_dir = self._temp_dir, + to_fuse_dir=self._temp_dir, override_mosaic_version="main", - script_out_name=self.GDAL_INIT_SCRIPT_FILENAME + script_out_name=self.GDAL_INIT_SCRIPT_FILENAME, + jni_so_copy=True, ) def run_init_script(self) -> int: @@ -33,7 +35,7 @@ def run_init_script(self) -> int: env=dict(os.environ, DATABRICKS_ROOT_VIRTUALENV_ENV=self._site_packages), ) return result.returncode - + def list_files(self) -> list[str]: return os.listdir(self._temp_dir) diff --git a/python/test/utils/spark_test_case.py b/python/test/utils/spark_test_case.py index 98d2743bf..6ae23b1b3 100644 --- a/python/test/utils/spark_test_case.py +++ b/python/test/utils/spark_test_case.py @@ -1,30 +1,40 @@ +import unittest +import os from importlib.metadata import version + from pyspark.sql import SparkSession import mosaic -import os -import unittest + class SparkTestCase(unittest.TestCase): spark = None library_location = None log4jref = None - + @classmethod def setUpClass(cls) -> None: cls.library_location = f"{mosaic.__path__[0]}/lib/mosaic-{version('databricks-mosaic')}-jar-with-dependencies.jar" if not os.path.exists(cls.library_location): cls.library_location = 
f"{mosaic.__path__[0]}/lib/mosaic-{version('databricks-mosaic')}-SNAPSHOT-jar-with-dependencies.jar" + cls.spark = ( - SparkSession.builder.master("local") + SparkSession.builder.master("local[*]") .config("spark.jars", cls.library_location) - .config("spark.driver.extraJavaOptions", "-Dorg.apache.logging.log4j.level=FATAL") - .config("spark.executor.extraJavaOptions", "-Dorg.apache.logging.log4j.level=FATAL") + .config("spark.driver.memory", "4g") + .config( + "spark.driver.extraJavaOptions", + "-Dorg.apache.logging.log4j.level=FATAL", + ) + .config( + "spark.executor.extraJavaOptions", + "-Dorg.apache.logging.log4j.level=FATAL", + ) .getOrCreate() ) cls.spark.conf.set("spark.databricks.labs.mosaic.jar.autoattach", "false") cls.spark.sparkContext.setLogLevel("FATAL") - + @classmethod def tearDownClass(cls) -> None: cls.spark.stop() diff --git a/src/main/scala/com/databricks/labs/mosaic/core/Mosaic.scala b/src/main/scala/com/databricks/labs/mosaic/core/Mosaic.scala index 57dac3db6..e00ecf3e4 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/Mosaic.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/Mosaic.scala @@ -11,6 +11,7 @@ import com.databricks.labs.mosaic.core.types.model.{GeometryTypeEnum, MosaicChip import com.databricks.labs.mosaic.core.types.model.GeometryTypeEnum._ import scala.annotation.tailrec +import scala.util.{Failure, Success, Try} /** * Single abstracted logic for mosaic fill via [[IndexSystem]]. 
[[IndexSystem]] @@ -66,22 +67,33 @@ object Mosaic { ): Seq[MosaicChip] = { val radius = indexSystem.getBufferRadius(geometry, resolution, geometryAPI) - // do not modify the radius val carvedGeometry = geometry.buffer(-radius) + + // add 1% to the radius to ensure union of carved and border geometries does not have holes inside the original geometry areas val borderGeometry = if (carvedGeometry.isEmpty) { - geometry.buffer(radius * 1.01).simplify(0.01 * radius) + geometry + .buffer(radius * 1.01) + .simplify(0.01 * radius) } else { - geometry.boundary.buffer(radius * 1.01).simplify(0.01 * radius) + geometry.boundary + .buffer(radius * 1.01) + .simplify(0.01 * radius) } - val coreIndices = indexSystem.polyfill(carvedGeometry, resolution, Some(geometryAPI)) - val borderIndices = indexSystem.polyfill(borderGeometry, resolution, Some(geometryAPI)).diff(coreIndices) + // check that the resulting geometry is within the bounds of + // the coordinate system (otherwise behaviour will be unpredictable) + val originalGeometryConstrained = indexSystem.alignToGrid(geometry) + val carvedGeometryConstrained = indexSystem.alignToGrid(carvedGeometry) + val borderGeometryConstrained = indexSystem.alignToGrid(borderGeometry) + + val coreIndices = indexSystem.polyfill(carvedGeometryConstrained, resolution, geometryAPI) + val borderIndices = indexSystem.polyfill(borderGeometryConstrained, resolution, geometryAPI).diff(coreIndices) val coreChips = indexSystem.getCoreChips(coreIndices, keepCoreGeom, geometryAPI) - val borderChips = indexSystem.getBorderChips(geometry, borderIndices, keepCoreGeom, geometryAPI) + val borderChips = indexSystem.getBorderChips(originalGeometryConstrained, borderIndices, keepCoreGeom, geometryAPI) coreChips ++ borderChips } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/geometry/MosaicGeometry.scala b/src/main/scala/com/databricks/labs/mosaic/core/geometry/MosaicGeometry.scala index 3c639446e..3f000f7b3 100644 --- 
a/src/main/scala/com/databricks/labs/mosaic/core/geometry/MosaicGeometry.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/geometry/MosaicGeometry.scala @@ -186,4 +186,6 @@ trait MosaicGeometry extends GeometryWriter with Serializable { ) } + def getAPI: GeometryAPI + } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/geometry/MosaicGeometryJTS.scala b/src/main/scala/com/databricks/labs/mosaic/core/geometry/MosaicGeometryJTS.scala index e860d0b67..56a454c2f 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/geometry/MosaicGeometryJTS.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/geometry/MosaicGeometryJTS.scala @@ -1,5 +1,6 @@ package com.databricks.labs.mosaic.core.geometry +import com.databricks.labs.mosaic.core.geometry.api.{GeometryAPI, JTS} import com.databricks.labs.mosaic.core.geometry.geometrycollection.MosaicGeometryCollectionJTS import com.databricks.labs.mosaic.core.geometry.linestring.MosaicLineStringJTS import com.databricks.labs.mosaic.core.geometry.multilinestring.MosaicMultiLineStringJTS @@ -221,6 +222,7 @@ abstract class MosaicGeometryJTS(geom: Geometry) extends MosaicGeometry { override def transformCRSXY(sridTo: Int): MosaicGeometryJTS = super.transformCRSXY(sridTo, None).asInstanceOf[MosaicGeometryJTS] + override def getAPI: GeometryAPI = JTS } object MosaicGeometryJTS extends GeometryReader { diff --git a/src/main/scala/com/databricks/labs/mosaic/core/geometry/api/GeometryAPI.scala b/src/main/scala/com/databricks/labs/mosaic/core/geometry/api/GeometryAPI.scala index 68815cb8b..18f3aae1d 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/geometry/api/GeometryAPI.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/geometry/api/GeometryAPI.scala @@ -2,6 +2,7 @@ package com.databricks.labs.mosaic.core.geometry.api import com.databricks.labs.mosaic.MOSAIC_GEOMETRY_API import com.databricks.labs.mosaic.codegen.format._ +import com.databricks.labs.mosaic.core.crs.CRSBoundsProvider import 
com.databricks.labs.mosaic.core.geometry._ import com.databricks.labs.mosaic.core.geometry.point._ import com.databricks.labs.mosaic.core.types._ @@ -18,12 +19,16 @@ abstract class GeometryAPI( ) extends Serializable { def createBbox(xMin: Double, yMin: Double, xMax: Double, yMax: Double): MosaicGeometry = { - val p1 = fromGeoCoord(Coordinates(xMin, yMin)) - val p2 = fromGeoCoord(Coordinates(xMin, yMax)) - val p3 = fromGeoCoord(Coordinates(xMax, yMax)) - val p4 = fromGeoCoord(Coordinates(xMax, yMin)) - val p5 = fromGeoCoord(Coordinates(xMin, yMin)) - geometry(Seq(p1, p2, p3, p4, p5), GeometryTypeEnum.POLYGON) + val p1 = fromGeoCoord(Coordinates(yMin, xMin)) + val p2 = fromGeoCoord(Coordinates(yMax, xMin)) + val p3 = fromGeoCoord(Coordinates(yMax, xMax)) + val p4 = fromGeoCoord(Coordinates(yMin, xMax)) + geometry(Seq(p1, p2, p3, p4, p1), GeometryTypeEnum.POLYGON) + } + + def geographicExtent(spatialReferenceID: Int): MosaicGeometry = { + val bounds = CRSBoundsProvider(this).reprojectedBounds("EPSG", spatialReferenceID) + createBbox(bounds.lowerLeft.getX, bounds.lowerLeft.getY, bounds.upperRight.getX, bounds.upperRight.getY) } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/index/BNGIndexSystem.scala b/src/main/scala/com/databricks/labs/mosaic/core/index/BNGIndexSystem.scala index f325ae8aa..bc424dda4 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/index/BNGIndexSystem.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/index/BNGIndexSystem.scala @@ -182,11 +182,11 @@ object BNGIndexSystem extends IndexSystem(StringType) with Serializable { * @return * A set of indices representing the input geometry. 
*/ - override def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: Option[GeometryAPI]): Seq[Long] = { - require(geometryAPI.isDefined, "GeometryAPI cannot be None for BNG Index System.") + override def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: GeometryAPI): Seq[Long] = { +// require(geometryAPI.isDefined, "GeometryAPI cannot be None for BNG Index System.") @tailrec def visit(queue: Set[Long], visited: Set[Long], result: Set[Long]): Set[Long] = { - val visits = queue.map(index => (index, geometry.contains(indexToGeometry(index, geometryAPI.get).getCentroid))) + val visits = queue.map(index => (index, geometry.contains(indexToGeometry(index, geometryAPI).getCentroid))) val matches = visits.filter(_._2) val newVisited = visited ++ visits.map(_._1) val newQueue = matches.flatMap(c => kLoop(c._1, 1).filterNot(newVisited.contains)) @@ -371,19 +371,6 @@ object BNGIndexSystem extends IndexSystem(StringType) with Serializable { */ override def resolutions: Set[Int] = Set(1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6) - /** - * Get the geometry corresponding to the index with the input id. - * - * @param index - * Id of the index whose geometry should be returned. - * @return - * An instance of [[MosaicGeometry]] corresponding to index. - */ - override def indexToGeometry(index: String, geometryAPI: GeometryAPI): MosaicGeometry = { - val indexId = parse(index) - indexToGeometry(indexId, geometryAPI) - } - /** * Provides a long representation from a string representation of a BNG * index id. 
The string representations follows letter prefix followed by diff --git a/src/main/scala/com/databricks/labs/mosaic/core/index/CustomIndexSystem.scala b/src/main/scala/com/databricks/labs/mosaic/core/index/CustomIndexSystem.scala index 73c066a28..a5b64cf12 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/index/CustomIndexSystem.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/index/CustomIndexSystem.scala @@ -142,8 +142,8 @@ case class CustomIndexSystem(conf: GridConf) extends IndexSystem(LongType) with * @return * A set of indices representing the input geometry. */ - override def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: Option[GeometryAPI]): Seq[Long] = { - require(geometryAPI.isDefined, "GeometryAPI cannot be None.") + override def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: GeometryAPI): Seq[Long] = { +// require(geometryAPI.isDefined, "GeometryAPI cannot be None.") if (geometry.isEmpty) { return Seq[Long]() } @@ -170,7 +170,7 @@ case class CustomIndexSystem(conf: GridConf) extends IndexSystem(LongType) with val result = cellCenters // Select only cells which center falls within the geometry - .filter(cell => geometry.contains(geometryAPI.get.fromGeoCoord(Coordinates(cell._2, cell._1)))) + .filter(cell => geometry.contains(geometryAPI.fromGeoCoord(Coordinates(cell._2, cell._1)))) // Extract cellIDs only .map(cell => pointToIndex(cell._1, cell._2, resolution)) @@ -231,18 +231,6 @@ case class CustomIndexSystem(conf: GridConf) extends IndexSystem(LongType) with geometryAPI.geometry(Seq(p1, p2, p3, p4, p1), POLYGON) } - /** - * Get the geometry corresponding to the index with the input id. - * - * @param index - * Id of the index whose geometry should be returned. - * @return - * An instance of [[MosaicGeometry]] corresponding to index. 
- */ - override def indexToGeometry(index: String, geometryAPI: GeometryAPI): MosaicGeometry = { - indexToGeometry(index.toLong, geometryAPI) - } - /** * Get the index ID corresponding to the provided coordinates. * diff --git a/src/main/scala/com/databricks/labs/mosaic/core/index/H3IndexSystem.scala b/src/main/scala/com/databricks/labs/mosaic/core/index/H3IndexSystem.scala index 8b5c2e6c5..740f2c9ed 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/index/H3IndexSystem.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/index/H3IndexSystem.scala @@ -2,7 +2,7 @@ package com.databricks.labs.mosaic.core.index import com.databricks.labs.mosaic.core.geometry.MosaicGeometry import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI -import com.databricks.labs.mosaic.core.types.model.Coordinates +import com.databricks.labs.mosaic.core.types.model.{Coordinates, GeometryTypeEnum} import com.databricks.labs.mosaic.core.types.model.GeometryTypeEnum.{LINESTRING, POLYGON} import com.uber.h3core.H3Core import com.uber.h3core.util.GeoCoord @@ -11,6 +11,7 @@ import org.apache.spark.unsafe.types.UTF8String import org.locationtech.jts.geom.Geometry import scala.collection.JavaConverters._ +//import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable` import scala.collection.mutable import scala.util.{Success, Try} @@ -26,6 +27,8 @@ object H3IndexSystem extends IndexSystem(LongType) with Serializable { val name = "H3" + override def isCylindrical: Boolean = true + // An instance of H3Core to be used for IndexSystem implementation. @transient private val h3: H3Core = H3Core.newInstance() @@ -74,12 +77,16 @@ object H3IndexSystem extends IndexSystem(LongType) with Serializable { * when performing polyfill. 
*/ override def getBufferRadius(geometry: MosaicGeometry, resolution: Int, geometryAPI: GeometryAPI): Double = { - val centroid = geometry.getCentroid + val centroid = geometry.getCentroid.mapXY((x, y) => if (x > 180) (-180 + x % 180, y) else if (x < -180) (180 - x % 180, y) else (x, y)).getCentroid val centroidIndex = h3.geoToH3(centroid.getY, centroid.getX, resolution) val indexGeom = indexToGeometry(centroidIndex, geometryAPI) - val boundary = indexGeom.getShellPoints.head // first shell is always in head - val indexGeomCentroid = indexGeom.getCentroid - boundary.map(_.distance(indexGeomCentroid)).max + GeometryTypeEnum.fromString(indexGeom.getGeometryType) match { + case POLYGON => + val boundary = indexGeom.getShellPoints.head // first shell is always in head + boundary.map(_.distance(centroid)).max + case _ => + indexGeom.flatten.flatMap(_.boundary.flatten).maxBy(_.getLength).getLength + } } /** @@ -97,13 +104,22 @@ object H3IndexSystem extends IndexSystem(LongType) with Serializable { val boundary = h3.h3ToGeoBoundary(index).asScala val extended = boundary ++ List(boundary.head) - val geom = - if (crossesNorthPole(index) || crossesSouthPole(index)) makePoleGeometry(boundary, crossesNorthPole(index), geometryAPI) + val geom = if (crossesNorthPole(index) || crossesSouthPole(index)) makePoleGeometry(boundary, crossesNorthPole(index), geometryAPI) else makeSafeGeometry(extended, geometryAPI) + geom.setSpatialReference(crsID) geom } + override def alignToGrid(geometry: MosaicGeometry): MosaicGeometry = { + val extent = geometry.getAPI.geographicExtent(crsID) + val width = extent.minMaxCoord("X", "MAX") - extent.minMaxCoord("X", "MIN") + val central = geometry.intersection(extent) + val left = geometry.intersection(extent.translate(-width, 0)).translate(width, 0) + val right = geometry.intersection(extent.translate(width, 0)).translate(-width, 0) + central.union(left).union(right) + } + /** * H3 polyfill logic is based on the centroid point of the individual index 
* geometry. Blind spots do occur near the boundary of the geometry. @@ -115,9 +131,9 @@ object H3IndexSystem extends IndexSystem(LongType) with Serializable { * @return * A set of indices representing the input geometry. */ - override def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: Option[GeometryAPI] = None): Seq[Long] = { - if (geometry.isEmpty) Seq.empty[Long] - else { + override def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: GeometryAPI): Seq[Long] = { + + def geomToIndices(geometry: MosaicGeometry): Seq[Long] = { val shellPoints = geometry.getShellPoints val holePoints = geometry.getHolePoints (for (i <- 0 until geometry.getNumGeometries) yield { @@ -128,6 +144,13 @@ object H3IndexSystem extends IndexSystem(LongType) with Serializable { indices.asScala.map(_.toLong) }).flatten } + + if (geometry.isEmpty) Seq.empty[Long] + else { + // split the geometry across the meridian + val westernHemi = geometryAPI.createBbox(-180.0, -90.0, 0.0, 90.0) + geomToIndices(geometry.intersection(westernHemi)) ++ geomToIndices(geometry.difference(westernHemi)) + } } /** @@ -192,26 +215,6 @@ object H3IndexSystem extends IndexSystem(LongType) with Serializable { */ override def resolutions: Set[Int] = (0 to 15).toSet - /** - * Get the geometry corresponding to the index with the input id. - * - * @param index - * Id of the index whose geometry should be returned. - * @return - * An instance of [[MosaicGeometry]] corresponding to index. 
- */ - override def indexToGeometry(index: String, geometryAPI: GeometryAPI): MosaicGeometry = { - val boundary = h3.h3ToGeoBoundary(index).asScala - val extended = boundary ++ List(boundary.head) - - val geom = - if (crossesNorthPole(index) || crossesSouthPole(index)) makePoleGeometry(boundary, crossesNorthPole(index), geometryAPI) - else makeSafeGeometry(extended, geometryAPI) - - geom.setSpatialReference(crsID) - geom - } - override def format(id: Long): String = { val geo = h3.h3ToGeo(id) h3.geoToH3Address(geo.lat, geo.lng, h3.h3GetResolution(id)) @@ -393,14 +396,17 @@ object H3IndexSystem extends IndexSystem(LongType) with Serializable { val unsafeGeometry = makeUnsafeGeometry(coordinates, geometryAPI) + makeSafeGeometry(geometryAPI, unsafeGeometry) + } + + private def makeSafeGeometry(geometryAPI: GeometryAPI, unsafeGeometry: MosaicGeometry) = { if (crossesAntiMeridian(unsafeGeometry)) { val shiftedGeometry = unsafeGeometry.mapXY(shiftEast) - val westGeom = shiftedGeometry.intersection(makeEastBBox(geometryAPI: GeometryAPI)) - val eastGeom = shiftedGeometry.intersection(makeShiftedWestBBox(geometryAPI: GeometryAPI)).mapXY(shiftWest) + val westGeom = shiftedGeometry.intersection(makeEastBBox(geometryAPI)) + val eastGeom = shiftedGeometry.intersection(makeShiftedWestBBox(geometryAPI)).mapXY(shiftWest) westGeom.union(eastGeom) } else { unsafeGeometry } } - } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/index/IndexSystem.scala b/src/main/scala/com/databricks/labs/mosaic/core/index/IndexSystem.scala index 64ea08c7a..e18b0335a 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/index/IndexSystem.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/index/IndexSystem.scala @@ -17,6 +17,8 @@ abstract class IndexSystem(var cellIdType: DataType) extends Serializable { // Passthrough if not redefined def isValid(cellID: Long): Boolean = true + def isCylindrical: Boolean = false + def crsID: Int /** @@ -145,6 +147,8 @@ abstract class 
IndexSystem(var cellIdType: DataType) extends Serializable { */ def getBufferRadius(geometry: MosaicGeometry, resolution: Int, geometryAPI: GeometryAPI): Double + def alignToGrid(geometry: MosaicGeometry): MosaicGeometry = geometry + /** * Returns a set of indices that represent the input geometry. Depending on * the index system this set may include only indices whose centroids fall @@ -159,7 +163,7 @@ abstract class IndexSystem(var cellIdType: DataType) extends Serializable { * @return * A set of indices representing the input geometry. */ - def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: Option[GeometryAPI] = None): Seq[Long] + def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: GeometryAPI): Seq[Long] /** * @see @@ -218,16 +222,6 @@ abstract class IndexSystem(var cellIdType: DataType) extends Serializable { */ def indexToGeometry(index: Long, geometryAPI: GeometryAPI): MosaicGeometry - /** - * Get the geometry corresponding to the index with the input id. - * - * @param index - * Id of the index whose geometry should be returned. - * @return - * An instance of [[MosaicGeometry]] corresponding to index. - */ - def indexToGeometry(index: String, geometryAPI: GeometryAPI): MosaicGeometry - /** * Get the index ID corresponding to the provided coordinates. * diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala index 3fa45f8e5..5ff3ceeaf 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala @@ -231,4 +231,13 @@ case class MosaicRasterBandGDAL(band: Band, id: Int) { */ def isNoDataMask: Boolean = band.GetMaskFlags() == gdalconstConstants.GMF_NODATA + /** + * @return + * Returns true if the band is empty. 
+ */ + def isEmpty: Boolean = { + val stats = band.AsMDArray().GetStatistics() + stats.getValid_count == 0 + } + } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala index 4f51749dc..33f980748 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala @@ -4,6 +4,7 @@ import com.databricks.labs.mosaic.core.geometry.MosaicGeometry import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI import com.databricks.labs.mosaic.core.index.IndexSystem import com.databricks.labs.mosaic.core.raster.api.GDAL +import com.databricks.labs.mosaic.core.raster.gdal.MosaicRasterGDAL.readRaster import com.databricks.labs.mosaic.core.raster.io.RasterCleaner.dispose import com.databricks.labs.mosaic.core.raster.io.{RasterCleaner, RasterReader, RasterWriter} import com.databricks.labs.mosaic.core.raster.operator.clip.RasterClipByVector @@ -19,7 +20,7 @@ import org.locationtech.proj4j.CRSFactory import java.nio.file.{Files, Paths, StandardCopyOption} import java.util.{Locale, Vector => JVector} import scala.collection.JavaConverters.dictionaryAsScalaMapConverter -import scala.util.Try +import scala.util.{Failure, Success, Try} /** GDAL implementation of the MosaicRaster trait. 
*/ //noinspection DuplicatedCode @@ -34,7 +35,7 @@ case class MosaicRasterGDAL( def getSpatialReference: SpatialReference = { if (raster != null) { - raster.GetSpatialRef + spatialRef } else { val tmp = refresh() val result = tmp.spatialRef @@ -47,9 +48,9 @@ case class MosaicRasterGDAL( protected val crsFactory: CRSFactory = new CRSFactory // Only use this with GDAL rasters - private val wsg84 = new osr.SpatialReference() - wsg84.ImportFromEPSG(4326) - wsg84.SetAxisMappingStrategy(osr.osrConstants.OAMS_TRADITIONAL_GIS_ORDER) + private val wgs84 = new osr.SpatialReference() + wgs84.ImportFromEPSG(4326) + wgs84.SetAxisMappingStrategy(osr.osrConstants.OAMS_TRADITIONAL_GIS_ORDER) /** * @return @@ -152,7 +153,8 @@ case class MosaicRasterGDAL( * Returns the raster's subdatasets as a Map. */ def subdatasets: Map[String, String] = { - val dict = raster.GetMetadata_Dict("SUBDATASETS") + val dict = Try(raster.GetMetadata_Dict("SUBDATASETS")) + .getOrElse(new java.util.Hashtable[String, String]()) val subdatasetsMap = Option(dict) .map(_.asScala.toMap.asInstanceOf[Map[String, String]]) .getOrElse(Map.empty[String, String]) @@ -183,6 +185,25 @@ case class MosaicRasterGDAL( .toInt } + + /** + * @return + * Returns a new raster, copied to a temporary file, whose CRS is set to the EPSG code given by srid. The current raster is disposed. + */ + def setSRID(srid: Int): MosaicRasterGDAL = { + val srs = new osr.SpatialReference() + srs.ImportFromEPSG(srid) + raster.SetSpatialRef(srs) + val driver = raster.GetDriver() + val newPath = PathUtils.createTmpFilePath(GDAL.getExtension(driverShortName)) + driver.CreateCopy(newPath, raster) + val newRaster = MosaicRasterGDAL.openRaster(newPath, Some(driverShortName)) + dispose(this) + MosaicRasterGDAL(newRaster, newPath, parentPath, driverShortName, -1) + } + + + /** * @return * Returns the raster's proj4 string. @@ -214,7 +235,13 @@ case class MosaicRasterGDAL( * @return * Returns the raster's number of bands.
*/ - def numBands: Int = raster.GetRasterCount() + def numBands: Int = { + val bandCount = Try(raster.GetRasterCount()) + bandCount match { + case Success(value) => value + case Failure(_) => 0 + } + } // noinspection ZeroIndexToHead /** @@ -257,7 +284,7 @@ case class MosaicRasterGDAL( * @return * Returns the raster's spatial reference. */ - def spatialRef: SpatialReference = raster.GetSpatialRef() + def spatialRef: SpatialReference = Option(raster.GetSpatialRef()).getOrElse(wgs84) /** * Applies a function to each band of the raster. @@ -272,7 +299,7 @@ case class MosaicRasterGDAL( * @return * Returns MosaicGeometry representing bounding box of the raster. */ - def bbox(geometryAPI: GeometryAPI, destCRS: SpatialReference = wsg84): MosaicGeometry = { + def bbox(geometryAPI: GeometryAPI, destCRS: SpatialReference = wgs84): MosaicGeometry = { val gt = getGeoTransform val sourceCRS = spatialRef @@ -300,23 +327,16 @@ case class MosaicRasterGDAL( * compute since it requires reading the raster and computing statistics. 
*/ def isEmpty: Boolean = { - import org.json4s._ - import org.json4s.jackson.JsonMethods._ - implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats - - val vector = new JVector[String]() - vector.add("-stats") - vector.add("-json") - val infoOptions = new InfoOptions(vector) - val gdalInfo = GDALInfo(raster, infoOptions) - val json = parse(gdalInfo).extract[Map[String, Any]] - - if (json.contains("STATISTICS_VALID_PERCENT")) { - json("STATISTICS_VALID_PERCENT").asInstanceOf[Double] == 0.0 - } else if (subdatasets.nonEmpty) { - false + val bands = getBands + if (bands.isEmpty) { + subdatasets + .values + .filter(_.toLowerCase(Locale.ROOT).startsWith(driverShortName.toLowerCase(Locale.ROOT))) + .flatMap(readRaster(_, path).getBands) + .takeWhile(_.isEmpty) + .nonEmpty } else { - getBandStats.values.map(_.getOrElse("mean", 0.0)).forall(_ == 0.0) + bands.takeWhile(_.isEmpty).nonEmpty } } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala index 6daabc25c..41c967947 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala @@ -47,7 +47,7 @@ object RasterClipByVector { val result = GDALWarp.executeWarp( resultFileName, Seq(raster), - command = s"gdalwarp -wo CUTLINE_ALL_TOUCHED=TRUE -of $outShortName -cutline $shapeFileName -crop_to_cutline -co COMPRESS=DEFLATE -dstalpha" + command = s"gdalwarp -wo CUTLINE_ALL_TOUCHED=TRUE -of $outShortName -cutline $shapeFileName -crop_to_cutline -co COMPRESS=DEFLATE" ) VectorClipper.cleanUpClipper(shapeFileName) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALCalc.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALCalc.scala index 97a273d13..c33e293d6 100644 --- 
a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALCalc.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALCalc.scala @@ -8,7 +8,7 @@ import com.databricks.labs.mosaic.utils.SysUtils object GDALCalc { val gdal_calc: String = { - val calcPath = SysUtils.runCommand("find / -iname gdal_calc.py")._1.split("\n").headOption.getOrElse("") + val calcPath = SysUtils.runCommand("which gdal_calc.py")._1.split("\n").headOption.getOrElse("") if (calcPath.isEmpty) { throw new RuntimeException("Could not find gdal_calc.py.") } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/RasterTessellate.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/RasterTessellate.scala index d186de0a5..0b31519ea 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/RasterTessellate.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/RasterTessellate.scala @@ -41,7 +41,7 @@ object RasterTessellate { (false, MosaicRasterTile(cell.index, null, "", "")) } else { val cellRaster = tmpRaster.getRasterForCell(cellID, indexSystem, geometryAPI) - val isValidRaster = cellRaster.getBandStats.values.map(_("mean")).sum > 0 && !cellRaster.isEmpty + val isValidRaster = !cellRaster.isEmpty ( isValidRaster, MosaicRasterTile(cell.index, cellRaster, raster.getParentPath, raster.getDriversShortName) ) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/ReTile.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/ReTile.scala index edaab4720..9a995b6b1 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/ReTile.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/ReTile.scala @@ -32,10 +32,10 @@ object ReTile { val yTiles = Math.ceil(yR / tileHeight).toInt val tiles = for (x <- 0 until xTiles; y <- 0 until yTiles) yield { - val xMin = if 
(x == 0) x * tileWidth else x * tileWidth - 1 - val yMin = if (y == 0) y * tileHeight else y * tileHeight - 1 - val xOffset = if (xMin + tileWidth + 1 > xR) xR - xMin else tileWidth + 1 - val yOffset = if (yMin + tileHeight + 1 > yR) yR - yMin else tileHeight + 1 + val xMin = x * tileWidth + val yMin = y * tileHeight + val xOffset = if (xMin + tileWidth > xR) xR - xMin else tileWidth + val yOffset = if (yMin + tileHeight > yR) yR - yMin else tileHeight val fileExtension = raster.getRasterFileExtension val rasterPath = PathUtils.createTmpFilePath(fileExtension) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/separate/SeparateBands.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/separate/SeparateBands.scala new file mode 100644 index 000000000..25f73bf8b --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/separate/SeparateBands.scala @@ -0,0 +1,51 @@ +package com.databricks.labs.mosaic.core.raster.operator.separate + +import com.databricks.labs.mosaic.core.raster.io.RasterCleaner.dispose +import com.databricks.labs.mosaic.core.raster.operator.gdal.GDALTranslate +import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile +import com.databricks.labs.mosaic.utils.PathUtils + +/** SeparateBands is a helper object for splitting multi-band rasters into single-band-per-row. */ +object SeparateBands { + + /** + * Separates raster bands into separate rasters. Empty bands are discarded. + * + * @param tile + * The raster tile whose bands are separated. + * @return + * A sequence of MosaicRasterTile objects.
+ */ + def separate( + tile: => MosaicRasterTile + ): Seq[MosaicRasterTile] = { + val raster = tile.getRaster + val tiles = for (i <- 0 until raster.numBands) yield { + val fileExtension = raster.getRasterFileExtension + val rasterPath = PathUtils.createTmpFilePath(fileExtension) + val shortDriver = raster.getDriversShortName + + val result = GDALTranslate.executeTranslate( + rasterPath, + raster, + command = s"gdal_translate -of $shortDriver -b ${i + 1} -co COMPRESS=DEFLATE" + ) + + val isEmpty = result.isEmpty + + result.raster.SetMetadataItem("MOSAIC_BAND_INDEX", (i + 1).toString) + result.raster.GetDriver().CreateCopy(result.path, result.raster) + + if (isEmpty) dispose(result) + + (isEmpty, result, i) + + } + + val (_, valid) = tiles.partition(_._1) + + valid.map(t => new MosaicRasterTile(null, t._2, raster.getParentPath, raster.getDriversShortName)) + + } + +} diff --git a/src/main/scala/com/databricks/labs/mosaic/core/types/model/MosaicRasterTile.scala b/src/main/scala/com/databricks/labs/mosaic/core/types/model/MosaicRasterTile.scala index e7a8e9218..798b11206 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/types/model/MosaicRasterTile.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/types/model/MosaicRasterTile.scala @@ -7,6 +7,8 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.{BinaryType, DataType, LongType, StringType} import org.apache.spark.unsafe.types.UTF8String +import scala.util.{Failure, Success, Try} + /** * A case class modeling an instance of a mosaic raster tile. * @@ -36,6 +38,7 @@ case class MosaicRasterTile( /** * Indicates whether the raster is present. + * * @return * True if the raster is present, false otherwise. */ @@ -46,7 +49,6 @@ case class MosaicRasterTile( * * @param indexSystem * Index system to use for formatting. - * * @return * MosaicChip with formatted index ID. 
*/ @@ -55,13 +57,13 @@ case class MosaicRasterTile( (indexSystem.getCellIdDataType, index) match { case (_: LongType, Left(_)) => this case (_: StringType, Right(_)) => this - case (_: LongType, Right(value)) => MosaicRasterTile( + case (_: LongType, Right(value)) => new MosaicRasterTile( index = Left(indexSystem.parse(value)), raster = raster, parentPath = parentPath, driver = driver ) - case (_: StringType, Left(value)) => MosaicRasterTile( + case (_: StringType, Left(value)) => new MosaicRasterTile( index = Right(indexSystem.format(value)), raster = raster, parentPath = parentPath, @@ -76,7 +78,6 @@ case class MosaicRasterTile( * * @param indexSystem * Index system to use for formatting. - * * @return * MosaicChip with formatted index ID. */ @@ -88,6 +89,7 @@ case class MosaicRasterTile( /** * Formats the index ID as the string type. + * * @param indexSystem * Index system to use for formatting. * @return @@ -138,6 +140,11 @@ case class MosaicRasterTile( GDAL.writeRasters(Seq(raster), checkpointLocation, rasterDataType).head } + def getSequenceNumber: Int = + Try(raster.getRaster.GetMetadataItem("BAND_INDEX", "DATABRICKS_MOSAIC")) match { + case Success(value) => value.toInt + case Failure(_) => -1 + } } /** Companion object. 
*/ @@ -162,12 +169,12 @@ object MosaicRasterTile { // noinspection TypeCheckCanBeMatch if (Option(index).isDefined) { if (index.isInstanceOf[Long]) { - MosaicRasterTile(Left(index.asInstanceOf[Long]), raster, parentPath, driver) + new MosaicRasterTile(Left(index.asInstanceOf[Long]), raster, parentPath, driver) } else { - MosaicRasterTile(Right(index.asInstanceOf[UTF8String].toString), raster, parentPath, driver) + new MosaicRasterTile(Right(index.asInstanceOf[UTF8String].toString), raster, parentPath, driver) } } else { - MosaicRasterTile(null, raster, parentPath, driver) + new MosaicRasterTile(null, raster, parentPath, driver) } } diff --git a/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReadInMemory.scala b/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReadInMemory.scala index 0517ac1d9..fe2f17148 100644 --- a/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReadInMemory.scala +++ b/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReadInMemory.scala @@ -95,7 +95,7 @@ object ReadInMemory extends ReadStrategy { case other => throw new RuntimeException(s"Unsupported field name: $other") } val rasterTileSer = InternalRow.fromSeq( - Seq(null, contentBytes, UTF8String.fromString(inPath), UTF8String.fromString(driverShortName)) + Seq(null, contentBytes, UTF8String.fromString(inPath), UTF8String.fromString(driverShortName), null) ) val row = Utils.createRow( fields ++ Seq(rasterTileSer) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/geometry/ST_IntersectionAgg.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/geometry/ST_IntersectionAgg.scala index 5de6e60d9..89e6d6f45 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/geometry/ST_IntersectionAgg.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/geometry/ST_IntersectionAgg.scala @@ -32,7 +32,7 @@ case class ST_IntersectionAgg( private[geometry] def getCellGeom(row: InternalRow, dt: DataType) = { 
dt.asInstanceOf[StructType].fields.find(_.name == "index_id").map(_.dataType) match { case Some(LongType) => indexSystem.indexToGeometry(row.getLong(1), geometryAPI) - case Some(StringType) => indexSystem.indexToGeometry(row.getString(1), geometryAPI) + case Some(StringType) => indexSystem.indexToGeometry(indexSystem.parse(row.getString(1)), geometryAPI) case _ => throw new Error("Unsupported format for chips.") } } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/index/IndexGeometry.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/index/IndexGeometry.scala index 02a6da62f..cf423ab89 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/index/IndexGeometry.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/index/IndexGeometry.scala @@ -68,7 +68,7 @@ case class IndexGeometry(indexID: Expression, format: Expression, indexSystem: I val indexGeometry = indexID.dataType match { case LongType => indexSystem.indexToGeometry(input1.asInstanceOf[Long], geometryAPI) case IntegerType => indexSystem.indexToGeometry(input1.asInstanceOf[Int], geometryAPI) - case StringType => indexSystem.indexToGeometry(input1.asInstanceOf[UTF8String].toString, geometryAPI) + case StringType => indexSystem.indexToGeometry(indexSystem.parse(input1.asInstanceOf[UTF8String].toString), geometryAPI) case _ => throw new Error(s"${indexID.dataType} not supported.") } geometryAPI.serialize(indexGeometry, formatName) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/index/Polyfill.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/index/Polyfill.scala index e1c04861f..de13524fc 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/index/Polyfill.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/index/Polyfill.scala @@ -61,7 +61,7 @@ case class Polyfill(geom: Expression, resolution: Expression, indexSystem: Index val resolutionVal: Int = indexSystem.getResolution(input2) val geometry = 
geometryAPI.geometry(input1, geom.dataType) - val indices = indexSystem.polyfill(geometry, resolutionVal, Some(geometryAPI)) + val indices = indexSystem.polyfill(geometry, resolutionVal, geometryAPI) val formatted = indices.map(indexSystem.formatCellId) val serialized = ArrayData.toArrayData(formatted.toArray) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SeparateBands.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SeparateBands.scala new file mode 100644 index 000000000..395eb9704 --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SeparateBands.scala @@ -0,0 +1,57 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.raster.operator.separate.SeparateBands +import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile +import com.databricks.labs.mosaic.expressions.base.{GenericExpressionFactory, WithExpressionInfo} +import com.databricks.labs.mosaic.expressions.raster.base.RasterGeneratorExpression +import com.databricks.labs.mosaic.functions.MosaicExpressionConfig +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, NullIntolerant} + +/** + * Returns a set of new single-band rasters, one for each band in the input raster. + */ +case class RST_SeparateBands( + rasterExpr: Expression, + expressionConfig: MosaicExpressionConfig +) extends RasterGeneratorExpression[RST_SeparateBands](rasterExpr, expressionConfig) + with NullIntolerant + with CodegenFallback { + + /** + * Returns a set of new single-band rasters, one for each band in the input raster. 
+ */ + override def rasterGenerator(tile: MosaicRasterTile): Seq[MosaicRasterTile] = { + SeparateBands.separate(tile) + } + + override def children: Seq[Expression] = Seq(rasterExpr) + +} + +/** Expression info required for the expression registration for spark SQL. */ +object RST_SeparateBands extends WithExpressionInfo { + + override def name: String = "rst_separatebands" + + override def usage: String = + """ + |_FUNC_(expr1) - Separates raster bands into separate rasters. Empty bands are discarded. + |""".stripMargin + + override def example: String = + """ + | Examples: + | > SELECT _FUNC_(raster_tile); + | {index_id, raster_tile, parentPath, driver} + | {index_id, raster_tile, parentPath, driver} + | {index_id, raster_tile, parentPath, driver} + | ... + | """.stripMargin + + override def builder(expressionConfig: MosaicExpressionConfig): FunctionBuilder = { + GenericExpressionFactory.getBaseBuilder[RST_SeparateBands](3, expressionConfig) + } + +} diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetSRID.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetSRID.scala new file mode 100644 index 000000000..a3d44289a --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetSRID.scala @@ -0,0 +1,72 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.raster.io.RasterCleaner +import com.databricks.labs.mosaic.core.raster.operator.clip.RasterClipByVector +import com.databricks.labs.mosaic.core.types.RasterTileType +import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile +import com.databricks.labs.mosaic.expressions.base.{GenericExpressionFactory, WithExpressionInfo} +import com.databricks.labs.mosaic.expressions.raster.base.Raster1ArgExpression +import com.databricks.labs.mosaic.functions.MosaicExpressionConfig +import 
org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, NullIntolerant} + +/** The expression for setting the SRID of a raster. */ +case class RST_SetSRID( + rastersExpr: Expression, + sridExpr: Expression, + expressionConfig: MosaicExpressionConfig +) extends Raster1ArgExpression[RST_SetSRID]( + rastersExpr, + sridExpr, + RasterTileType(expressionConfig.getCellIdType), + returnsRaster = true, + expressionConfig = expressionConfig + ) + with NullIntolerant + with CodegenFallback { + + val geometryAPI: GeometryAPI = GeometryAPI(expressionConfig.getGeometryAPI) + + /** + * Sets the SRID of raster tiles. + * + * @param tile + * The raster to be used. + * @param arg1 + * The SRID to be used. + * @return + * The updated raster tile. + */ + override def rasterTransform(tile: MosaicRasterTile, arg1: Any): Any = { + val referenced = tile.getRaster.setSRID(arg1.asInstanceOf[Int]) + tile.copy(raster = referenced) + } + +} + +/** Expression info required for the expression registration for spark SQL. */ +object RST_SetSRID extends WithExpressionInfo { + + override def name: String = "rst_setsrid" + + override def usage: String = + """ + |_FUNC_(expr1) - Force set the SRID of a raster. + |""".stripMargin + + override def example: String = + """ + | Examples: + | > SELECT _FUNC_(raster, srid); + | {index_id, raster, parentPath, driver} + | {index_id, raster, parentPath, driver} + | ...
+ | """.stripMargin + + override def builder(expressionConfig: MosaicExpressionConfig): FunctionBuilder = { + GenericExpressionFactory.getBaseBuilder[RST_SetSRID](2, expressionConfig) + } + +} diff --git a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala index 905d6962e..59f1cf9d5 100644 --- a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala +++ b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala @@ -287,6 +287,7 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends mosaicRegistry.registerExpression[RST_RasterToWorldCoordX](expressionConfig) mosaicRegistry.registerExpression[RST_RasterToWorldCoordY](expressionConfig) mosaicRegistry.registerExpression[RST_ReTile](expressionConfig) + mosaicRegistry.registerExpression[RST_SeparateBands](expressionConfig) mosaicRegistry.registerExpression[RST_Rotation](expressionConfig) mosaicRegistry.registerExpression[RST_ScaleX](expressionConfig) mosaicRegistry.registerExpression[RST_ScaleY](expressionConfig) @@ -294,6 +295,7 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends mosaicRegistry.registerExpression[RST_SkewX](expressionConfig) mosaicRegistry.registerExpression[RST_SkewY](expressionConfig) mosaicRegistry.registerExpression[RST_SRID](expressionConfig) + mosaicRegistry.registerExpression[RST_SetSRID](expressionConfig) mosaicRegistry.registerExpression[RST_Subdatasets](expressionConfig) mosaicRegistry.registerExpression[RST_Summary](expressionConfig) mosaicRegistry.registerExpression[RST_Tessellate](expressionConfig) @@ -702,6 +704,8 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends ColumnAdapter(RST_ReTile(raster.expr, tileWidth.expr, tileHeight.expr, expressionConfig)) def rst_retile(raster: Column, tileWidth: Int, tileHeight: Int): Column = ColumnAdapter(RST_ReTile(raster.expr, 
lit(tileWidth).expr, lit(tileHeight).expr, expressionConfig)) + def rst_separatebands(raster: Column): Column = + ColumnAdapter(RST_SeparateBands(raster.expr, expressionConfig)) def rst_rotation(raster: Column): Column = ColumnAdapter(RST_Rotation(raster.expr, expressionConfig)) def rst_scalex(raster: Column): Column = ColumnAdapter(RST_ScaleX(raster.expr, expressionConfig)) def rst_scaley(raster: Column): Column = ColumnAdapter(RST_ScaleY(raster.expr, expressionConfig)) @@ -711,6 +715,7 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends def rst_skewx(raster: Column): Column = ColumnAdapter(RST_SkewX(raster.expr, expressionConfig)) def rst_skewy(raster: Column): Column = ColumnAdapter(RST_SkewY(raster.expr, expressionConfig)) def rst_srid(raster: Column): Column = ColumnAdapter(RST_SRID(raster.expr, expressionConfig)) + def rst_setsrid(raster: Column, srid: Column): Column = ColumnAdapter(RST_SetSRID(raster.expr, srid.expr, expressionConfig)) def rst_subdatasets(raster: Column): Column = ColumnAdapter(RST_Subdatasets(raster.expr, expressionConfig)) def rst_summary(raster: Column): Column = ColumnAdapter(RST_Summary(raster.expr, expressionConfig)) def rst_tessellate(raster: Column, resolution: Column): Column = diff --git a/src/main/scala/com/databricks/labs/mosaic/gdal/MosaicGDAL.scala b/src/main/scala/com/databricks/labs/mosaic/gdal/MosaicGDAL.scala index 9e8bf1132..d256a9870 100644 --- a/src/main/scala/com/databricks/labs/mosaic/gdal/MosaicGDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/gdal/MosaicGDAL.scala @@ -38,7 +38,7 @@ object MosaicGDAL extends Logging { gdal.SetConfigOption("GDAL_DISABLE_READDIR_ON_OPEN", "EMPTY_DIR") gdal.SetConfigOption("CPL_TMPDIR", CPL_TMPDIR) gdal.SetConfigOption("GDAL_PAM_PROXY_DIR", GDAL_PAM_PROXY_DIR) - gdal.SetConfigOption("GDAL_PAM_ENABLED", "NO") + gdal.SetConfigOption("GDAL_PAM_ENABLED", "YES") gdal.SetConfigOption("CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE", "NO") 
gdal.SetConfigOption("CPL_LOG", s"$CPL_TMPDIR/gdal.log") mosaicConfig.getGDALConf.foreach { case (k, v) => gdal.SetConfigOption(k.split("\\.").last, v) } diff --git a/src/test/resources/binary/netcdf-CMIP5/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc b/src/test/resources/binary/netcdf-CMIP5/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc new file mode 100644 index 000000000..33a03c5af Binary files /dev/null and b/src/test/resources/binary/netcdf-CMIP5/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc differ diff --git a/src/test/scala/com/databricks/labs/mosaic/core/index/H3IndexSystemTest.scala b/src/test/scala/com/databricks/labs/mosaic/core/index/H3IndexSystemTest.scala index a1bea4fcb..cb6e131aa 100644 --- a/src/test/scala/com/databricks/labs/mosaic/core/index/H3IndexSystemTest.scala +++ b/src/test/scala/com/databricks/labs/mosaic/core/index/H3IndexSystemTest.scala @@ -1,5 +1,6 @@ package com.databricks.labs.mosaic.core.index +import com.databricks.labs.mosaic.core.Mosaic.mosaicFill import com.databricks.labs.mosaic.core.geometry.api.{GeometryAPI, JTS} import com.databricks.labs.mosaic.core.geometry.MosaicGeometryJTS import com.databricks.labs.mosaic.core.index.H3IndexSystem.indexToGeometry @@ -20,15 +21,25 @@ class H3IndexSystemTest extends AnyFunSuite with Tolerance { val indexRes = H3IndexSystem.pointToIndex(10, 10, 10) noException shouldBe thrownBy { H3IndexSystem.format(indexRes) } noException shouldBe thrownBy { H3IndexSystem.getResolutionStr(10) } - noException shouldBe thrownBy { H3IndexSystem.indexToGeometry(H3IndexSystem.format(indexRes), JTS) } + noException shouldBe thrownBy { H3IndexSystem.indexToGeometry(H3IndexSystem.parse(H3IndexSystem.format(indexRes)), JTS) } an[IllegalArgumentException] shouldBe thrownBy { H3IndexSystem.getResolution(true) } an[IllegalStateException] shouldBe thrownBy { 
H3IndexSystem.getResolution("-1") } } test("H3IndexSystem polyfill signatures") { val geomJTS = MosaicGeometryJTS.fromWKT("POLYGON((1 2, 2 2, 2 1, 1 1, 1 2))") - noException shouldBe thrownBy { H3IndexSystem.polyfill(geomJTS, 10) } - noException shouldBe thrownBy { H3IndexSystem.polyfill(geomJTS, 10, Some(JTS)) } + val wrappedGeomJTS = MosaicGeometryJTS.fromWKT("POLYGON((179 2, 181 2, 181 1, 179 1, 179 2))") + noException shouldBe thrownBy { H3IndexSystem.polyfill(geomJTS, 10, JTS) } + noException shouldBe thrownBy { H3IndexSystem.polyfill(wrappedGeomJTS, 10, JTS) } + val expected = MosaicGeometryJTS.fromWKT("MULTIPOLYGON(((179 2, 180 2, 180 1, 179 1, 179 2)), ((-179 2, -180 2, -180 1, -179 1, -179 2)))") + val actualIndexes = mosaicFill(wrappedGeomJTS, 6, keepCoreGeom = true, H3IndexSystem, JTS) + val actual = actualIndexes.map(_.geom) + .reduce(_ union _) + actual.getArea - expected.getArea shouldEqual 0.0 +- 0.0001 + actual.minMaxCoord("x", "min") shouldEqual expected.minMaxCoord("x", "min") + actual.minMaxCoord("x", "max") shouldEqual expected.minMaxCoord("x", "max") + actual.minMaxCoord("y", "min") shouldEqual expected.minMaxCoord("y", "min") + actual.minMaxCoord("y", "max") shouldEqual expected.minMaxCoord("y", "max") } test("H3IndexSystem inherited methods") { @@ -138,7 +149,7 @@ class H3IndexSystemTest extends AnyFunSuite with Tolerance { geoms.foldLeft(0.0)((acc, geom) => acc + geom.getArea) shouldBe ((180.0 * 360.0) +- 0.0001) }) testCellsStr.foreach(cells => { - val geoms = cells.map(indexToGeometry(_, api)) + val geoms = cells.map(H3IndexSystem.parse).map(indexToGeometry(_, api)) geoms.foreach(geom => geom.isValid shouldBe true) geoms.foldLeft(0.0)((acc, geom) => acc + geom.getArea) shouldBe ((180.0 * 360.0) +- 0.0001) }) diff --git a/src/test/scala/com/databricks/labs/mosaic/core/index/TestBNGIndexSystem.scala b/src/test/scala/com/databricks/labs/mosaic/core/index/TestBNGIndexSystem.scala index ab8dca323..f45a8af7d 100644 --- 
a/src/test/scala/com/databricks/labs/mosaic/core/index/TestBNGIndexSystem.scala +++ b/src/test/scala/com/databricks/labs/mosaic/core/index/TestBNGIndexSystem.scala @@ -227,7 +227,7 @@ class TestBNGIndexSystem extends AnyFunSuite { BNGIndexSystem.getResolutionStr(4) shouldEqual "100m" BNGIndexSystem.getResolutionStr(-4) shouldEqual "500m" BNGIndexSystem.getResolutionStr(7) shouldEqual "" - an[Exception] should be thrownBy BNGIndexSystem.polyfill(null, 0, None) +// an[Exception] should be thrownBy BNGIndexSystem.polyfill(null, 0, None) } test("Issue 354: KRing should work near the edge of the grid") { diff --git a/src/test/scala/com/databricks/labs/mosaic/core/index/TestCustomIndexSystem.scala b/src/test/scala/com/databricks/labs/mosaic/core/index/TestCustomIndexSystem.scala index 022de18a4..74ba05439 100644 --- a/src/test/scala/com/databricks/labs/mosaic/core/index/TestCustomIndexSystem.scala +++ b/src/test/scala/com/databricks/labs/mosaic/core/index/TestCustomIndexSystem.scala @@ -137,15 +137,15 @@ class TestCustomIndexSystem extends AnyFunSuite { val resolutionMask = 0x01.toLong << 56 val geom = JTS.geometry("POLYGON ((0 0, 50 0, 50 50, 0 50, 0 0))", "WKT") - grid.polyfill(geom, 1, Some(JTS)).toSet shouldBe Set(0 | resolutionMask) + grid.polyfill(geom, 1, JTS).toSet shouldBe Set(0 | resolutionMask) // Geometry which cell center does not fall into does not get selected val geomSmall = JTS.geometry("POLYGON ((30 30, 40 30, 40 40, 30 40, 30 30))", "WKT") - grid.polyfill(geomSmall, 1, Some(JTS)).toSet shouldBe Set() + grid.polyfill(geomSmall, 1, JTS).toSet shouldBe Set() // Small geometry for which the cell center falls within should be detected val geomCentered = JTS.geometry("POLYGON ((24 24, 26 24, 26 26, 24 26, 24 24))", "WKT") - grid.polyfill(geomCentered, 1, Some(JTS)).toSet shouldBe Set(0 | resolutionMask) + grid.polyfill(geomCentered, 1, JTS).toSet shouldBe Set(0 | resolutionMask) } @@ -164,15 +164,15 @@ class TestCustomIndexSystem extends AnyFunSuite { 
grid.getCellPositionFromCoordinates(1.0, 1.0, resolution) shouldBe (4, 4, 36) val geom = JTS.geometry("POLYGON ((0 0, 25 0, 25 25, 0 25, 0 0))", "WKT") - grid.polyfill(geom, resolution, Some(JTS)).toSet shouldBe Set(36 | resolutionMask) + grid.polyfill(geom, resolution, JTS).toSet shouldBe Set(36 | resolutionMask) // Geometry which cell center does not fall into does not get selected val geomSmall = JTS.geometry("POLYGON ((0 0, 5 0, 5 5, 0 5, 0 0))", "WKT") - grid.polyfill(geomSmall, resolution, Some(JTS)).toSet shouldBe Set() + grid.polyfill(geomSmall, resolution, JTS).toSet shouldBe Set() // Small geometry for which the cell center falls within should be detected val geomCentered = JTS.geometry("POLYGON ((12 12, 13 12, 13 13, 12 13, 12 12))", "WKT") - grid.polyfill(geomCentered, resolution, Some(JTS)).toSet shouldBe Set(36 | resolutionMask) + grid.polyfill(geomCentered, resolution, JTS).toSet shouldBe Set(36 | resolutionMask) } @@ -192,7 +192,7 @@ class TestCustomIndexSystem extends AnyFunSuite { grid.getCellPositionFromCoordinates(1.0, 1.0, resolution) shouldBe (4, 4, 36) val geom = JTS.geometry("POLYGON ((-95 9, -50 9, -50 32, -95 32, -95 9))", "WKT") - grid.polyfill(geom, resolution, Some(JTS)).toSet shouldBe Set(34 | resolutionMask) + grid.polyfill(geom, resolution, JTS).toSet shouldBe Set(34 | resolutionMask) } @@ -204,7 +204,7 @@ class TestCustomIndexSystem extends AnyFunSuite { // Small geometry that spans multiple cels should be detected val geomMultiCell = JTS.geometry("POLYGON ((24 24, 76 24, 76 76, 24 76, 24 24))", "WKT") - grid.polyfill(geomMultiCell, 1, Some(JTS)).toSet shouldBe Set( + grid.polyfill(geomMultiCell, 1, JTS).toSet shouldBe Set( 0 | resolutionMask, 1 | resolutionMask, 2 | resolutionMask, @@ -213,7 +213,7 @@ class TestCustomIndexSystem extends AnyFunSuite { // Small geometry that spans multiple cels should be detected val geomAlmostMultiCell = JTS.geometry("POLYGON ((25 25, 75 25, 75 75, 25 75, 25 25))", "WKT") - 
grid.polyfill(geomAlmostMultiCell, 1, Some(JTS)).toSet shouldBe Set() + grid.polyfill(geomAlmostMultiCell, 1, JTS).toSet shouldBe Set() } @@ -223,7 +223,7 @@ class TestCustomIndexSystem extends AnyFunSuite { val grid = CustomIndexSystem(conf) val geom = JTS.geometry("POLYGON ((528435.784 6142513.9146, 528428.2785999998 6142513.0317, 528419.4486999996 6142513.9146, 528408.8529000003 6142513.9146, 528401.5744000003 6142515.895300001, 528396.9325000001 6142520.537, 528394.2835999997 6142526.718, 528393.4006000003 6142532.4574, 528395.608 6142538.6383, 528397.8154999996 6142540.845799999, 528402.2304999996 6142541.728800001, 528405.3208999997 6142542.1702, 528409.7357999999 6142543.053200001, 528414.5922999997 6142543.4947, 528418.5657000002 6142545.702199999, 528422.9807000002 6142548.7927, 528424.3051000005 6142552.766100001, 528429.1616000002 6142554.9736, 528433.1349999998 6142556.739499999, 528436.2254999997 6142555.8566, 528441.5234000003 6142554.090600001, 528445.3328999998 6142546.090700001, 528449.0288000004 6142536.872300001, 528448.6368000004 6142527.072899999, 528442.8479000004 6142517.8881, 528435.784 6142513.9146))", "WKT") - assert(grid.polyfill(geom, 4, Some(JTS)).nonEmpty) + assert(grid.polyfill(geom, 4, JTS).nonEmpty) } } diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/index/MosaicFillBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/index/MosaicFillBehaviors.scala index d4fbf78dd..44b443fc2 100644 --- a/src/test/scala/com/databricks/labs/mosaic/expressions/index/MosaicFillBehaviors.scala +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/index/MosaicFillBehaviors.scala @@ -3,7 +3,7 @@ package com.databricks.labs.mosaic.expressions.index import com.databricks.labs.mosaic.core.geometry.MosaicGeometry import com.databricks.labs.mosaic.core.index.{BNGIndexSystem, H3IndexSystem} import com.databricks.labs.mosaic.functions.MosaicContext -import com.databricks.labs.mosaic.test.{mocks, MosaicSpatialQueryTest} 
+import com.databricks.labs.mosaic.test.{MosaicSpatialQueryTest, mocks} import com.databricks.labs.mosaic.test.mocks.getBoroughs import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ @@ -11,6 +11,8 @@ import org.apache.spark.sql.functions.{size => arrayColumnSize} import org.apache.spark.sql.types._ import org.scalatest.matchers.should.Matchers._ +import scala.util.{Failure, Success, Try} + //noinspection ScalaDeprecation trait MosaicFillBehaviors extends MosaicSpatialQueryTest { @@ -183,10 +185,14 @@ trait MosaicFillBehaviors extends MosaicSpatialQueryTest { import sc.implicits._ import mc.functions._ - val geom = Seq("POLYGON ((5.26 52.72, 5.20 52.71, 5.21 52.75, 5.26 52.75, 5.26 52.72))").toDF("wkt") - val mosaics = geom + val geom = Seq("POLYGON ((-3.26 52.72, -3.20 52.71, -3.21 52.75, -3.26 52.75, -3.26 52.72))").toDF("wkt") + val geomProjected = Try(mc.getIndexSystem.crsID) match { + case Success(crsID) => geom.select(st_updatesrid(col("wkt"), lit(4326), lit(crsID)).alias("wkt")) + case Failure(_) => geom + } + val mosaics = geomProjected .select( - grid_tessellate(col("wkt"), 3).alias("tessellation") + grid_tessellate( col("wkt"), 3).alias("tessellation") ) .select(arrayColumnSize($"tessellation.chips").alias("number_of_chips")) .select($"number_of_chips") diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAggBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAggBehaviors.scala index 03d82b955..fac40c9a9 100644 --- a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAggBehaviors.scala +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAggBehaviors.scala @@ -47,7 +47,8 @@ trait RST_DerivedBandAggBehaviors extends QueryTest { // Do not indent the code in the SQL statement // It will be wrongly interpreted in python as broken - noException should be thrownBy spark.sql(""" +// noException should be 
thrownBy + spark.sql(""" |select rst_derivedband_agg( | tiles, |" diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SeparateBandsBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SeparateBandsBehaviors.scala new file mode 100644 index 000000000..0da223907 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SeparateBandsBehaviors.scala @@ -0,0 +1,50 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions._ +import org.scalatest.matchers.should.Matchers._ + +trait RST_SeparateBandsBehaviors extends QueryTest { + + def separateBandsBehavior(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/binary/netcdf-CMIP5/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc") + + val df = rastersInMemory + .withColumn("result", rst_separatebands($"tile")) + .select("result") + + rastersInMemory + .createOrReplaceTempView("source") + + noException should be thrownBy spark.sql(""" + |select rst_separatebands(tile) from source + |""".stripMargin) + + noException should be thrownBy rastersInMemory + .withColumn("result", rst_separatebands($"tile")) + .withColumn("result", rst_separatebands($"tile")) + .select("result") + + val result = df.collect().length + + result should be > 0 + + an[Exception] should be thrownBy spark.sql(""" + |select rst_separatebands() from source + |""".stripMargin) + + } + +} diff --git 
a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SeparateBandsTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SeparateBandsTest.scala new file mode 100644 index 000000000..407b47acc --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SeparateBandsTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_SeparateBandsTest extends QueryTest with SharedSparkSessionGDAL with RST_SeparateBandsBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. 
+ test("Testing RST_SeparateBands with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + separateBandsBehavior(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetSRIDBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetSRIDBehaviors.scala new file mode 100644 index 000000000..684333cd4 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetSRIDBehaviors.scala @@ -0,0 +1,52 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.lit +import org.scalatest.matchers.should.Matchers._ + +trait RST_SetSRIDBehaviors extends QueryTest { + + def setSRIDBehavior(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/modis") + + val df = rastersInMemory + .withColumn("result", rst_setsrid($"tile", lit(4326))) + .select("result") + + rastersInMemory + .createOrReplaceTempView("source") + + noException should be thrownBy spark.sql(""" + |select rst_setsrid(tile, 4326) from source + |""".stripMargin) + + noException should be thrownBy rastersInMemory + .withColumn("result", rst_setsrid($"tile", lit(4326))) + .select("result") + + val result = df + .where(rst_srid($"result") === lit(4326)) + .collect + .length + + result > 0 shouldBe true + + an[Exception] should be thrownBy spark.sql(""" + |select rst_setsrid() from source + |""".stripMargin) + + } + +} diff --git 
a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetSRIDTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetSRIDTest.scala new file mode 100644 index 000000000..200703ae4 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetSRIDTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_SetSRIDTest extends QueryTest with SharedSparkSessionGDAL with RST_SetSRIDBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. 
+ test("Testing RST_SetSRID with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + setSRIDBehavior(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_TessellateBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_TessellateBehaviors.scala index c346e82db..8804968e1 100644 --- a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_TessellateBehaviors.scala +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_TessellateBehaviors.scala @@ -4,6 +4,7 @@ import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI import com.databricks.labs.mosaic.core.index.IndexSystem import com.databricks.labs.mosaic.functions.MosaicContext import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.lit import org.scalatest.matchers.should.Matchers._ trait RST_TessellateBehaviors extends QueryTest { @@ -39,7 +40,22 @@ trait RST_TessellateBehaviors extends QueryTest { val result = gridTiles.collect() - result.length should be(380) + result.length should be(462) + + val netcdf = spark.read + .format("gdal") + .option("raster.read.strategy", "in-memory") + .load("src/test/resources/binary/netcdf-CMIP5/prAdjust_day_HadGEM2-CC_SMHI-DBSrev930-GFD-1981-2010-postproc_rcp45_r1i1p1_20201201-20201231.nc") + .withColumn("tile", rst_separatebands($"tile")) + .withColumn("tile", rst_setsrid($"tile", lit(4326))) + .limit(1) + + val netcdfGridTiles = netcdf + .select(rst_tessellate($"tile", lit(1)).alias("tile")) + + val netcdfResult = netcdfGridTiles.collect() + + netcdfResult.length should be(491) } diff --git a/src/test/scala/com/databricks/labs/mosaic/functions/auxiliary/BadIndexSystem.scala b/src/test/scala/com/databricks/labs/mosaic/functions/auxiliary/BadIndexSystem.scala index 162224242..e8cb685fb 100644 --- a/src/test/scala/com/databricks/labs/mosaic/functions/auxiliary/BadIndexSystem.scala +++ 
b/src/test/scala/com/databricks/labs/mosaic/functions/auxiliary/BadIndexSystem.scala @@ -29,13 +29,11 @@ object BadIndexSystem extends IndexSystem(BooleanType) { override def getBufferRadius(geometry: MosaicGeometry, resolution: Int, geometryAPI: GeometryAPI): Double = throw new UnsupportedOperationException - override def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: Option[GeometryAPI]): Seq[Long] = + override def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: GeometryAPI): Seq[Long] = throw new UnsupportedOperationException override def indexToGeometry(index: Long, geometryAPI: GeometryAPI): MosaicGeometry = throw new UnsupportedOperationException - override def indexToGeometry(index: String, geometryAPI: GeometryAPI): MosaicGeometry = throw new UnsupportedOperationException - override def pointToIndex(lon: Double, lat: Double, resolution: Int): Long = throw new UnsupportedOperationException override def parse(id: String): Long = throw new UnsupportedOperationException diff --git a/src/test/scala/com/databricks/labs/mosaic/test/package.scala b/src/test/scala/com/databricks/labs/mosaic/test/package.scala index 435ee552c..5d76fb48a 100644 --- a/src/test/scala/com/databricks/labs/mosaic/test/package.scala +++ b/src/test/scala/com/databricks/labs/mosaic/test/package.scala @@ -353,7 +353,7 @@ package object test { override def name: String = "MOCK" - override def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: Option[GeometryAPI]): Seq[Long] = ??? + override def polyfill(geometry: MosaicGeometry, resolution: Int, geometryAPI: GeometryAPI): Seq[Long] = ??? override def format(id: Long): String = ??? @@ -371,8 +371,6 @@ package object test { override def indexToGeometry(index: Long, geometryAPI: GeometryAPI): MosaicGeometry = ??? - override def indexToGeometry(index: String, geometryAPI: GeometryAPI): MosaicGeometry = ??? 
- override def getBufferRadius(geometry: MosaicGeometry, resolution: Int, geometryAPI: GeometryAPI): Double = ??? override def parse(id: String): Long = ???