From 8af65868b67cc9288318259b4a165c9c95f8d9e4 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Mon, 23 Dec 2024 14:15:06 -0500 Subject: [PATCH] GeometryStreamBuilder (#965) ### Change list - Instead of using really complex code to figure out which underlying array to push each individual `xy` call to, we use an underlying `GeoWriter` to collect each geometry into a scratch buffer, and then push that geometry using our existing `push_geometry` APIs. This simplifies the code a _lot_. - It may have a negative performance impact because we're allocating extra heap buffers for the small vecs. In the future, if we want to optimize this, we may want to create a version of `geo-types` that uses something like bumpalo Closes https://github.com/geoarrow/geoarrow-rs/issues/365 --- python/Cargo.lock | 94 ++--- .../src/ffi/from_python/scalar.rs | 7 +- rust/geoarrow/src/array/geometry/builder.rs | 14 +- rust/geoarrow/src/array/mixed/builder.rs | 12 +- .../src/io/flatgeobuf/reader/async.rs | 14 +- .../geoarrow/src/io/flatgeobuf/reader/sync.rs | 14 +- rust/geoarrow/src/io/geojson/reader.rs | 4 +- rust/geoarrow/src/io/geojson_lines/reader.rs | 4 +- .../geoarrow/src/io/geozero/array/geometry.rs | 242 +++++++++++++ rust/geoarrow/src/io/geozero/array/mixed.rs | 338 ------------------ rust/geoarrow/src/io/geozero/array/mod.rs | 4 +- rust/geoarrow/src/io/geozero/mod.rs | 2 +- .../src/io/geozero/scalar/geometry.rs | 9 +- rust/geoarrow/src/io/postgis/reader.rs | 4 +- 14 files changed, 328 insertions(+), 434 deletions(-) create mode 100644 rust/geoarrow/src/io/geozero/array/geometry.rs delete mode 100644 rust/geoarrow/src/io/geozero/array/mixed.rs diff --git a/python/Cargo.lock b/python/Cargo.lock index 0bef08a0..fcc29183 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.94" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" [[package]] name = "approx" @@ -330,7 +330,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -341,7 +341,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -451,9 +451,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.20.0" +version = "1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b37c88a63ffd85d15b406896cc343916d7cf57838a847b3a6f2ca5d39a5695a" +checksum = "ef657dfab802224e671f5818e9a4935f9b1957ed18e58292690cc39e7a4092a3" [[package]] name = "byteorder" @@ -722,7 +722,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -759,7 +759,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -869,9 +869,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "foldhash" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" +checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" [[package]] name = "form_urlencoded" @@ -949,7 +949,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -1064,7 +1064,7 @@ dependencies = [ [[package]] name = "geoarrow" -version = "0.4.0-beta.2" +version = "0.4.0-beta.3" dependencies = [ "arrow", "arrow-array", @@ -1435,9 +1435,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.4" +version = "0.27.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6884a48c6826ec44f524c7456b163cebe9e55a18d7b5e307cb4f100371cc767" +checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" dependencies = [ "futures-util", "http", @@ -1652,7 +1652,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -2151,7 +2151,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -2172,9 +2172,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.5" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] @@ -2360,7 +2360,7 @@ dependencies = [ "phf_shared", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -2570,7 +2570,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -2583,7 +2583,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -2632,7 +2632,7 @@ dependencies = [ "rustc-hash", "rustls 0.23.20", "socket2", - "thiserror 2.0.8", + "thiserror 2.0.9", "tokio", "tracing", ] @@ -2651,7 +2651,7 @@ dependencies = [ "rustls 0.23.20", "rustls-pki-types", "slab", - "thiserror 2.0.8", + "thiserror 2.0.9", "tinyvec", "tracing", "web-time", @@ -3097,14 +3097,14 @@ checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] name = "serde_json" -version = "1.0.133" +version = "1.0.134" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" +checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d" dependencies = [ "itoa", "memchr", @@ -3211,7 +3211,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -3514,9 +3514,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.90" +version = "2.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" +checksum = "d53cbcb5a243bd33b7858b1d7f4aca2153490815872d86d955d6ea29f743c035" dependencies = [ "proc-macro2", "quote", @@ -3540,7 +3540,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -3573,11 +3573,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.8" +version = "2.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f5383f3e0071702bf93ab5ee99b52d26936be9dedd9413067cbdcddcb6141a" +checksum = "f072643fd0190df67a8bab670c20ef5d8737177d6ac6b2e9a236cb096206b2cc" dependencies = [ - "thiserror-impl 2.0.8", + "thiserror-impl 2.0.9", ] [[package]] @@ -3588,18 +3588,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] name = "thiserror-impl" -version = "2.0.8" +version = "2.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f357fcec90b3caef6623a099691be676d033b40a058ac95d2a6ade6fa0c943" +checksum = "7b50fa271071aae2e6ee85f842e2e28ba8cd2c5fb67f11fcb1fd70b276f9e7d4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -3653,9 +3653,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" +checksum = "022db8904dfa342efe721985167e9fcd16c29b226db4397ed752a761cfce81e8" dependencies = [ "tinyvec_macros", ] @@ -3690,7 +3690,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -3770,7 +3770,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -3948,7 +3948,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", "wasm-bindgen-shared", ] @@ -3983,7 +3983,7 @@ checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4317,7 +4317,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", "synstructure", ] @@ -4339,7 +4339,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] @@ -4359,7 +4359,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", "synstructure", ] @@ -4388,7 +4388,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.91", ] [[package]] diff --git a/python/pyo3-geoarrow/src/ffi/from_python/scalar.rs b/python/pyo3-geoarrow/src/ffi/from_python/scalar.rs index 3890aa99..778c0a6b 100644 --- a/python/pyo3-geoarrow/src/ffi/from_python/scalar.rs +++ b/python/pyo3-geoarrow/src/ffi/from_python/scalar.rs @@ -2,8 +2,7 @@ use std::sync::Arc; use crate::array::*; use crate::scalar::*; -use geoarrow::datatypes::Dimension; -use geoarrow::io::geozero::ToMixedArray; +use geoarrow::io::geozero::ToGeometryArray; use geoarrow::scalar::GeometryScalar; use geozero::geojson::GeoJsonString; use pyo3::exceptions::PyValueError; @@ -24,9 +23,9 @@ impl<'a> FromPyObject<'a> for PyGeometry { // Parse GeoJSON to geometry scalar let reader = GeoJsonString(json_string); - // TODO: we need a dynamic dimensionality reader + // TODO: use ToGeometry directly in the future? let arr = reader - .to_mixed_geometry_array(Dimension::XY) + .to_geometry_array() .map_err(|err| PyValueError::new_err(err.to_string()))?; Ok(Self( GeometryScalar::try_new(Arc::new(arr)) diff --git a/rust/geoarrow/src/array/geometry/builder.rs b/rust/geoarrow/src/array/geometry/builder.rs index 86302186..80408817 100644 --- a/rust/geoarrow/src/array/geometry/builder.rs +++ b/rust/geoarrow/src/array/geometry/builder.rs @@ -426,7 +426,7 @@ impl<'a> GeometryBuilder { } #[inline] - pub(crate) fn add_point_type(&mut self, dim: Dimension) { + fn add_point_type(&mut self, dim: Dimension) { match dim { Dimension::XY => { self.offsets.push(self.point_xy.len().try_into().unwrap()); @@ -508,7 +508,7 @@ impl<'a> GeometryBuilder { } #[inline] - pub(crate) fn add_line_string_type(&mut self, dim: Dimension) { + fn add_line_string_type(&mut self, dim: Dimension) { match dim { Dimension::XY => { self.offsets @@ -589,7 +589,7 @@ impl<'a> GeometryBuilder { } #[inline] - pub(crate) fn add_polygon_type(&mut self, dim: Dimension) { + fn add_polygon_type(&mut self, dim: Dimension) { match dim { Dimension::XY => { self.offsets.push(self.polygon_xy.len().try_into().unwrap()); @@ -644,7 +644,7 @@ impl<'a> GeometryBuilder { } #[inline] - pub(crate) fn add_multi_point_type(&mut self, dim: Dimension) { + fn add_multi_point_type(&mut self, dim: Dimension) { match dim { Dimension::XY => { self.offsets.push(self.mpoint_xy.len().try_into().unwrap()); @@ -700,7 +700,7 @@ impl<'a> GeometryBuilder { } #[inline] - pub(crate) fn add_multi_line_string_type(&mut self, dim: Dimension) { + fn add_multi_line_string_type(&mut self, dim: Dimension) { match dim { Dimension::XY => { self.offsets @@ -756,7 +756,7 @@ impl<'a> GeometryBuilder { } #[inline] - pub(crate) fn add_multi_polygon_type(&mut self, dim: Dimension) { + fn add_multi_polygon_type(&mut self, dim: Dimension) { match dim { Dimension::XY => { self.offsets @@ -846,7 +846,7 @@ impl<'a> GeometryBuilder { } #[inline] - pub(crate) fn add_geometry_collection_type(&mut self, dim: Dimension) { + fn add_geometry_collection_type(&mut self, dim: Dimension) { match dim { Dimension::XY => { self.offsets.push(self.gc_xy.len().try_into().unwrap()); diff --git a/rust/geoarrow/src/array/mixed/builder.rs b/rust/geoarrow/src/array/mixed/builder.rs index 9e1cbe5d..c54f5510 100644 --- a/rust/geoarrow/src/array/mixed/builder.rs +++ b/rust/geoarrow/src/array/mixed/builder.rs @@ -267,7 +267,7 @@ impl<'a> MixedGeometryBuilder { } #[inline] - pub(crate) fn add_point_type(&mut self) { + fn add_point_type(&mut self) { self.offsets.push(self.points.len().try_into().unwrap()); match self.dim { Dimension::XY => self.types.push(1), @@ -298,7 +298,7 @@ impl<'a> MixedGeometryBuilder { } #[inline] - pub(crate) fn add_line_string_type(&mut self) { + fn add_line_string_type(&mut self) { self.offsets .push(self.line_strings.len().try_into().unwrap()); match self.dim { @@ -327,7 +327,7 @@ impl<'a> MixedGeometryBuilder { } #[inline] - pub(crate) fn add_polygon_type(&mut self) { + fn add_polygon_type(&mut self) { self.offsets.push(self.polygons.len().try_into().unwrap()); match self.dim { Dimension::XY => self.types.push(3), @@ -350,7 +350,7 @@ impl<'a> MixedGeometryBuilder { } #[inline] - pub(crate) fn add_multi_point_type(&mut self) { + fn add_multi_point_type(&mut self) { self.offsets .push(self.multi_points.len().try_into().unwrap()); match self.dim { @@ -374,7 +374,7 @@ impl<'a> MixedGeometryBuilder { } #[inline] - pub(crate) fn add_multi_line_string_type(&mut self) { + fn add_multi_line_string_type(&mut self) { self.offsets .push(self.multi_line_strings.len().try_into().unwrap()); match self.dim { @@ -398,7 +398,7 @@ impl<'a> MixedGeometryBuilder { } #[inline] - pub(crate) fn add_multi_polygon_type(&mut self) { + fn add_multi_polygon_type(&mut self) { self.offsets .push(self.multi_polygons.len().try_into().unwrap()); match self.dim { diff --git a/rust/geoarrow/src/io/flatgeobuf/reader/async.rs b/rust/geoarrow/src/io/flatgeobuf/reader/async.rs index ac4733d4..9526ec54 100644 --- a/rust/geoarrow/src/io/flatgeobuf/reader/async.rs +++ b/rust/geoarrow/src/io/flatgeobuf/reader/async.rs @@ -12,7 +12,7 @@ use crate::datatypes::Dimension; use crate::error::{GeoArrowError, Result}; use crate::io::flatgeobuf::reader::common::{infer_schema, parse_crs, FlatGeobufReaderOptions}; use crate::io::flatgeobuf::reader::object_store_reader::ObjectStoreWrapper; -use crate::io::geozero::array::MixedGeometryStreamBuilder; +use crate::io::geozero::array::GeometryStreamBuilder; use crate::io::geozero::table::{GeoTableBuilder, GeoTableBuilderOptions}; use crate::table::Table; @@ -94,10 +94,8 @@ pub async fn read_flatgeobuf_async( (GeometryType::MultiLineString, false) => impl_read!(MultiLineStringBuilder, Dimension::XY), (GeometryType::MultiPolygon, false) => impl_read!(MultiPolygonBuilder, Dimension::XY), (GeometryType::Unknown, false) => { - let mut builder = GeoTableBuilder::::new_with_options( - Dimension::XY, - options, - ); + let mut builder = + GeoTableBuilder::::new_with_options(Dimension::XY, options); selection.process_features(&mut builder).await?; let table = builder.finish()?; table.downcast() @@ -117,10 +115,8 @@ pub async fn read_flatgeobuf_async( (GeometryType::MultiLineString, true) => impl_read!(MultiLineStringBuilder, Dimension::XYZ), (GeometryType::MultiPolygon, true) => impl_read!(MultiPolygonBuilder, Dimension::XYZ), (GeometryType::Unknown, true) => { - let mut builder = GeoTableBuilder::::new_with_options( - Dimension::XYZ, - options, - ); + let mut builder = + GeoTableBuilder::::new_with_options(Dimension::XYZ, options); selection.process_features(&mut builder).await?; let table = builder.finish()?; table.downcast() diff --git a/rust/geoarrow/src/io/flatgeobuf/reader/sync.rs b/rust/geoarrow/src/io/flatgeobuf/reader/sync.rs index 37790570..4d08704c 100644 --- a/rust/geoarrow/src/io/flatgeobuf/reader/sync.rs +++ b/rust/geoarrow/src/io/flatgeobuf/reader/sync.rs @@ -24,7 +24,7 @@ use crate::array::*; use crate::datatypes::Dimension; use crate::error::{GeoArrowError, Result}; use crate::io::flatgeobuf::reader::common::{infer_schema, parse_crs, FlatGeobufReaderOptions}; -use crate::io::geozero::array::MixedGeometryStreamBuilder; +use crate::io::geozero::array::GeometryStreamBuilder; use crate::io::geozero::table::{GeoTableBuilder, GeoTableBuilderOptions}; use crate::table::Table; use flatgeobuf::{FallibleStreamingIterator, FgbReader, GeometryType}; @@ -99,10 +99,8 @@ pub fn read_flatgeobuf( (GeometryType::MultiLineString, false) => impl_read!(MultiLineStringBuilder, Dimension::XY), (GeometryType::MultiPolygon, false) => impl_read!(MultiPolygonBuilder, Dimension::XY), (GeometryType::Unknown, false) => { - let mut builder = GeoTableBuilder::::new_with_options( - Dimension::XY, - options, - ); + let mut builder = + GeoTableBuilder::::new_with_options(Dimension::XY, options); selection.process_features(&mut builder)?; let table = builder.finish()?; table.downcast() @@ -122,10 +120,8 @@ pub fn read_flatgeobuf( (GeometryType::MultiLineString, true) => impl_read!(MultiLineStringBuilder, Dimension::XYZ), (GeometryType::MultiPolygon, true) => impl_read!(MultiPolygonBuilder, Dimension::XYZ), (GeometryType::Unknown, true) => { - let mut builder = GeoTableBuilder::::new_with_options( - Dimension::XYZ, - options, - ); + let mut builder = + GeoTableBuilder::::new_with_options(Dimension::XYZ, options); selection.process_features(&mut builder)?; let table = builder.finish()?; // TODO: 3d downcasting not implemented diff --git a/rust/geoarrow/src/io/geojson/reader.rs b/rust/geoarrow/src/io/geojson/reader.rs index 330cd3a6..6c71ef64 100644 --- a/rust/geoarrow/src/io/geojson/reader.rs +++ b/rust/geoarrow/src/io/geojson/reader.rs @@ -5,7 +5,7 @@ use std::io::Read; use crate::array::CoordType; use crate::datatypes::Dimension; use crate::error::Result; -use crate::io::geozero::array::MixedGeometryStreamBuilder; +use crate::io::geozero::array::GeometryStreamBuilder; use crate::io::geozero::table::{GeoTableBuilder, GeoTableBuilderOptions}; use crate::table::Table; @@ -22,7 +22,7 @@ pub fn read_geojson(reader: R, batch_size: Option) -> Result::new_with_options(Dimension::XY, options); + GeoTableBuilder::::new_with_options(Dimension::XY, options); geojson.process(&mut geo_table)?; geo_table.finish() } diff --git a/rust/geoarrow/src/io/geojson_lines/reader.rs b/rust/geoarrow/src/io/geojson_lines/reader.rs index f58ececc..4a5c734a 100644 --- a/rust/geoarrow/src/io/geojson_lines/reader.rs +++ b/rust/geoarrow/src/io/geojson_lines/reader.rs @@ -5,7 +5,7 @@ use std::io::BufRead; use crate::array::CoordType; use crate::datatypes::Dimension; use crate::error::Result; -use crate::io::geozero::array::MixedGeometryStreamBuilder; +use crate::io::geozero::array::GeometryStreamBuilder; use crate::io::geozero::table::{GeoTableBuilder, GeoTableBuilderOptions}; use crate::table::Table; @@ -26,7 +26,7 @@ pub fn read_geojson_lines(reader: R, batch_size: Option) -> R Default::default(), ); let mut geo_table = - GeoTableBuilder::::new_with_options(Dimension::XY, options); + GeoTableBuilder::::new_with_options(Dimension::XY, options); geojson_line_reader.process(&mut geo_table)?; geo_table.finish() } diff --git a/rust/geoarrow/src/io/geozero/array/geometry.rs b/rust/geoarrow/src/io/geozero/array/geometry.rs new file mode 100644 index 00000000..30938316 --- /dev/null +++ b/rust/geoarrow/src/io/geozero/array/geometry.rs @@ -0,0 +1,242 @@ +use std::fmt::Debug; +use std::sync::Arc; + +use geozero::error::GeozeroError; +use geozero::geo_types::GeoWriter; +use geozero::{GeomProcessor, GeozeroGeometry}; + +use crate::array::metadata::ArrayMetadata; +use crate::array::{CoordType, GeometryArray, GeometryBuilder}; +use crate::datatypes::Dimension; +use crate::trait_::GeometryArrayBuilder; +use crate::NativeArray; + +/// GeoZero trait to convert to GeoArrow [`GeometryArray`]. +pub trait ToGeometryArray { + /// Convert to GeoArrow [`GeometryArray`] + fn to_geometry_array(&self) -> geozero::error::Result { + Ok(self.to_geometry_builder()?.into()) + } + + /// Convert to a GeoArrow [`GeometryBuilder`] + fn to_geometry_builder(&self) -> geozero::error::Result; +} + +impl ToGeometryArray for T { + fn to_geometry_builder(&self) -> geozero::error::Result { + let mut stream_builder = GeometryStreamBuilder::new(); + self.process_geom(&mut stream_builder)?; + Ok(stream_builder.builder) + } +} + +/// A streaming builder for GeoArrow [`GeometryArray`]. +/// +/// This is useful in conjunction with [`geozero`] APIs because its coordinate stream requires the +/// consumer to keep track of which geometry type is currently being added to. +/// +/// Converting an [`GeometryStreamBuilder`] into a [`GeometryArray`] is `O(1)`. +// #[derive(Debug)] +pub struct GeometryStreamBuilder { + builder: GeometryBuilder, + current_geometry: GeoWriter, +} + +impl GeometryStreamBuilder { + pub fn new() -> Self { + Self::new_with_options(Default::default(), Default::default(), true) + } + + pub fn new_with_options( + coord_type: CoordType, + metadata: Arc, + prefer_multi: bool, + ) -> Self { + Self { + builder: GeometryBuilder::new_with_options(coord_type, metadata, prefer_multi), + current_geometry: GeoWriter::new(), + } + } + + #[allow(dead_code)] + pub fn push_null(&mut self) { + self.builder.push_null() + } + + pub fn finish(self) -> GeometryArray { + self.builder.finish() + } + + fn push_current_geometry(&mut self) -> geozero::error::Result<()> { + let geom = self + .current_geometry + .take_geometry() + .ok_or(GeozeroError::Geometry("Take geometry failed".to_string()))?; + self.builder + .push_geometry(Some(&geom)) + .map_err(|err| GeozeroError::Geometry(err.to_string()))?; + self.current_geometry = GeoWriter::new(); + Ok(()) + } +} + +impl Default for GeometryStreamBuilder { + fn default() -> Self { + Self::new() + } +} + +impl Debug for GeometryStreamBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "GeometryStreamBuilder") + } +} + +#[allow(unused_variables)] +impl GeomProcessor for GeometryStreamBuilder { + fn xy(&mut self, x: f64, y: f64, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.xy(x, y, idx) + } + + fn empty_point(&mut self, idx: usize) -> geozero::error::Result<()> { + // This needs to be separate because GeoWriter doesn't know how to handle empty points + Err(GeozeroError::Geometry( + "Empty points not currently supported in generic geometry builder.".to_string(), + )) + // if self.builder.prefer_multi { + // self.builder.add_multi_point_type(); + // self.builder + // .multi_points + // .push_point(None::<&geo::Point>) + // .unwrap(); + // } else { + // self.builder.add_point_type(); + // self.builder.points.push_empty(); + // } + // Ok(()) + } + + fn point_begin(&mut self, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.point_begin(idx) + } + + fn point_end(&mut self, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.point_end(idx)?; + self.push_current_geometry() + } + + fn multipoint_begin(&mut self, size: usize, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.multipoint_begin(size, idx) + } + + fn multipoint_end(&mut self, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.multipoint_end(idx)?; + self.push_current_geometry() + } + + fn linestring_begin( + &mut self, + tagged: bool, + size: usize, + idx: usize, + ) -> geozero::error::Result<()> { + self.current_geometry.linestring_begin(tagged, size, idx) + } + + fn linestring_end(&mut self, tagged: bool, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.linestring_end(tagged, idx)?; + self.push_current_geometry() + } + + fn multilinestring_begin(&mut self, size: usize, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.multilinestring_begin(size, idx) + } + + fn multilinestring_end(&mut self, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.multilinestring_end(idx)?; + self.push_current_geometry() + } + + fn polygon_begin( + &mut self, + tagged: bool, + size: usize, + idx: usize, + ) -> geozero::error::Result<()> { + self.current_geometry.polygon_begin(tagged, size, idx) + } + + fn polygon_end(&mut self, tagged: bool, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.polygon_end(tagged, idx)?; + self.push_current_geometry() + } + + fn multipolygon_begin(&mut self, size: usize, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.multipolygon_begin(size, idx) + } + + fn multipolygon_end(&mut self, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.multipolygon_end(idx)?; + self.push_current_geometry() + } + + fn geometrycollection_begin(&mut self, size: usize, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.geometrycollection_begin(size, idx) + } + + fn geometrycollection_end(&mut self, idx: usize) -> geozero::error::Result<()> { + self.current_geometry.geometrycollection_end(idx)?; + self.push_current_geometry() + } +} + +impl GeometryArrayBuilder for GeometryStreamBuilder { + fn len(&self) -> usize { + self.builder.len() + } + + fn nulls(&self) -> &arrow_buffer::NullBufferBuilder { + // Take this method off trait + todo!() + } + + fn push_geometry( + &mut self, + value: Option<&impl geo_traits::GeometryTrait>, + ) -> crate::error::Result<()> { + self.builder.push_geometry(value) + } + + fn new(dim: Dimension) -> Self { + Self::with_geom_capacity_and_options(dim, 0, Default::default(), Default::default()) + } + + fn into_array_ref(self) -> Arc { + self.builder.into_array_ref() + } + + fn with_geom_capacity_and_options( + _dim: Dimension, + _geom_capacity: usize, + coord_type: CoordType, + metadata: Arc, + ) -> Self { + Self::new_with_options(coord_type, metadata, true) + } + + fn set_metadata(&mut self, metadata: Arc) { + self.builder.set_metadata(metadata) + } + + fn finish(self) -> std::sync::Arc { + Arc::new(self.finish()) + } + + fn coord_type(&self) -> CoordType { + self.builder.coord_type() + } + + fn metadata(&self) -> Arc { + self.builder.metadata() + } +} diff --git a/rust/geoarrow/src/io/geozero/array/mixed.rs b/rust/geoarrow/src/io/geozero/array/mixed.rs deleted file mode 100644 index 29affcdb..00000000 --- a/rust/geoarrow/src/io/geozero/array/mixed.rs +++ /dev/null @@ -1,338 +0,0 @@ -use std::sync::Arc; - -use crate::array::metadata::ArrayMetadata; -use crate::array::{CoordType, GeometryArray, MixedGeometryArray, MixedGeometryBuilder}; -use crate::datatypes::Dimension; -use crate::io::geozero::scalar::process_geometry; -use crate::trait_::{ArrayAccessor, GeometryArrayBuilder}; -use crate::ArrayBase; -use crate::NativeArray; -use geozero::{GeomProcessor, GeozeroGeometry}; - -impl GeozeroGeometry for MixedGeometryArray { - fn process_geom(&self, processor: &mut P) -> geozero::error::Result<()> - where - Self: Sized, - { - let num_geometries = self.len(); - processor.geometrycollection_begin(num_geometries, 0)?; - - for geom_idx in 0..num_geometries { - process_geometry(&self.value(geom_idx), geom_idx, processor)?; - } - - processor.geometrycollection_end(num_geometries - 1)?; - Ok(()) - } -} - -// TODO: Add "promote to multi" here -/// GeoZero trait to convert to GeoArrow MixedArray. -pub trait ToMixedArray { - /// Convert to GeoArrow MixedArray - fn to_mixed_geometry_array(&self, dim: Dimension) - -> geozero::error::Result; - - /// Convert to a GeoArrow MixedArrayBuilder - fn to_mixed_geometry_builder( - &self, - dim: Dimension, - ) -> geozero::error::Result; -} - -impl ToMixedArray for T { - fn to_mixed_geometry_array( - &self, - dim: Dimension, - ) -> geozero::error::Result { - Ok(self.to_mixed_geometry_builder(dim)?.into()) - } - - fn to_mixed_geometry_builder( - &self, - dim: Dimension, - ) -> geozero::error::Result { - let mut stream_builder = MixedGeometryStreamBuilder::new(dim); - self.process_geom(&mut stream_builder)?; - Ok(stream_builder.builder) - } -} - -/// The current geometry type in which we're pushing coordinates. -#[derive(Debug, Clone, Copy, PartialEq)] -enum GeometryType { - Point = 1, - LineString = 2, - Polygon = 3, - MultiPoint = 4, - MultiLineString = 5, - MultiPolygon = 6, -} - -/// A streaming builder for GeoArrow MixedGeometryArray. -/// -/// This is useful in conjunction with [`geozero`] APIs because its coordinate stream requires the -/// consumer to keep track of which geometry type is currently being added to. -/// -/// Converting an [`MixedGeometryStreamBuilder`] into a [`MixedGeometryArray`] is `O(1)`. -#[derive(Debug)] -pub struct MixedGeometryStreamBuilder { - builder: MixedGeometryBuilder, - // Note: we don't know if, when `linestring_end` is called, that means a ring of a polygon has - // finished or if a tagged line string has finished. This means we can't have an "unknown" enum - // type, because we'll never be able to set it to unknown after a line string is done, meaning - // that we can't rely on it being unknown or not. - current_geom_type: GeometryType, -} - -impl MixedGeometryStreamBuilder { - pub fn new(dim: Dimension) -> Self { - Self::new_with_options(dim, Default::default(), Default::default(), true) - } - - pub fn new_with_options( - dim: Dimension, - coord_type: CoordType, - metadata: Arc, - prefer_multi: bool, - ) -> Self { - Self { - builder: MixedGeometryBuilder::new_with_options( - dim, - coord_type, - metadata, - prefer_multi, - ), - current_geom_type: GeometryType::Point, - } - } - - #[allow(dead_code)] - pub fn push_null(&mut self) { - self.builder.push_null() - } - - pub fn finish(self) -> GeometryArray { - // Hack until the MixedGeometryStreamBuilder is updated to build a GeometryBuilder directly - self.builder.finish().into() - } -} - -impl Default for MixedGeometryStreamBuilder { - fn default() -> Self { - Self::new(Dimension::XY) - } -} - -#[allow(unused_variables)] -impl GeomProcessor for MixedGeometryStreamBuilder { - fn xy(&mut self, x: f64, y: f64, idx: usize) -> geozero::error::Result<()> { - match self.current_geom_type { - GeometryType::Point => { - if self.builder.prefer_multi { - self.builder.multi_points.xy(x, y, idx) - } else { - self.builder.points.xy(x, y, idx) - } - } - GeometryType::LineString => { - if self.builder.prefer_multi { - self.builder.multi_line_strings.xy(x, y, idx) - } else { - self.builder.line_strings.xy(x, y, idx) - } - } - GeometryType::Polygon => { - if self.builder.prefer_multi { - self.builder.multi_polygons.xy(x, y, idx) - } else { - self.builder.polygons.xy(x, y, idx) - } - } - GeometryType::MultiPoint => self.builder.multi_points.xy(x, y, idx), - GeometryType::MultiLineString => self.builder.multi_line_strings.xy(x, y, idx), - GeometryType::MultiPolygon => self.builder.multi_polygons.xy(x, y, idx), - } - } - - fn empty_point(&mut self, idx: usize) -> geozero::error::Result<()> { - if self.builder.prefer_multi { - self.builder.add_multi_point_type(); - self.builder - .multi_points - .push_point(None::<&geo::Point>) - .unwrap(); - } else { - self.builder.add_point_type(); - self.builder.points.push_empty(); - } - Ok(()) - } - - /// NOTE: It appears that point_begin is **only** called for "tagged" `Point` geometries. I.e. - /// a coord of another geometry never has `point_begin` called. Each point of a multi point - /// does not have `point_begin` called. - fn point_begin(&mut self, idx: usize) -> geozero::error::Result<()> { - self.current_geom_type = GeometryType::Point; - if self.builder.prefer_multi { - self.builder.add_multi_point_type(); - self.builder.multi_points.point_begin(idx)?; - } else { - self.builder.add_point_type(); - self.builder.points.point_begin(idx)?; - } - Ok(()) - } - - fn multipoint_begin(&mut self, size: usize, idx: usize) -> geozero::error::Result<()> { - self.current_geom_type = GeometryType::MultiPoint; - self.builder.add_multi_point_type(); - self.builder.multi_points.multipoint_begin(size, idx) - } - - fn linestring_begin( - &mut self, - tagged: bool, - size: usize, - idx: usize, - ) -> geozero::error::Result<()> { - if tagged { - self.current_geom_type = GeometryType::LineString; - if self.builder.prefer_multi { - self.builder.add_multi_line_string_type(); - } else { - self.builder.add_line_string_type(); - } - }; - - match self.current_geom_type { - GeometryType::LineString => { - if self.builder.prefer_multi { - self.builder - .multi_line_strings - .linestring_begin(tagged, size, idx) - } else { - self.builder - .line_strings - .linestring_begin(tagged, size, idx) - } - } - GeometryType::MultiLineString => self - .builder - .multi_line_strings - .linestring_begin(tagged, size, idx), - GeometryType::Polygon => { - if self.builder.prefer_multi { - self.builder - .multi_polygons - .linestring_begin(tagged, size, idx) - } else { - self.builder.polygons.linestring_begin(tagged, size, idx) - } - } - GeometryType::MultiPolygon => self - .builder - .multi_polygons - .linestring_begin(tagged, size, idx), - _ => panic!( - "unexpected linestring_begin for {:?}", - self.current_geom_type - ), - } - } - - fn multilinestring_begin(&mut self, size: usize, idx: usize) -> geozero::error::Result<()> { - self.current_geom_type = GeometryType::MultiLineString; - self.builder.add_multi_line_string_type(); - self.builder - .multi_line_strings - .multilinestring_begin(size, idx) - } - - fn polygon_begin( - &mut self, - tagged: bool, - size: usize, - idx: usize, - ) -> geozero::error::Result<()> { - if tagged { - self.current_geom_type = GeometryType::Polygon; - if self.builder.prefer_multi { - self.builder.add_multi_polygon_type(); - } else { - self.builder.add_polygon_type(); - } - }; - - match self.current_geom_type { - GeometryType::Polygon => { - if self.builder.prefer_multi { - self.builder.multi_polygons.polygon_begin(tagged, size, idx) - } else { - self.builder.polygons.polygon_begin(tagged, size, idx) - } - } - GeometryType::MultiPolygon => { - self.builder.multi_polygons.polygon_begin(tagged, size, idx) - } - _ => panic!("unexpected polygon_begin for {:?}", self.current_geom_type), - } - } - - fn multipolygon_begin(&mut self, size: usize, idx: usize) -> geozero::error::Result<()> { - self.current_geom_type = GeometryType::MultiPolygon; - self.builder.add_multi_polygon_type(); - self.builder.multi_polygons.multipolygon_begin(size, idx) - } -} - -impl GeometryArrayBuilder for MixedGeometryStreamBuilder { - fn len(&self) -> usize { - self.builder.len() - } - - fn nulls(&self) -> &arrow_buffer::NullBufferBuilder { - // Take this method off trait - todo!() - } - - fn push_geometry( - &mut self, - _value: Option<&impl geo_traits::GeometryTrait>, - ) -> crate::error::Result<()> { - todo!() - } - - fn new(dim: Dimension) -> Self { - Self::with_geom_capacity_and_options(dim, 0, Default::default(), Default::default()) - } - - fn into_array_ref(self) -> Arc { - self.builder.into_array_ref() - } - - fn with_geom_capacity_and_options( - dim: Dimension, - _geom_capacity: usize, - coord_type: CoordType, - metadata: Arc, - ) -> Self { - Self::new_with_options(dim, coord_type, metadata, true) - } - - fn set_metadata(&mut self, metadata: Arc) { - self.builder.set_metadata(metadata) - } - - fn finish(self) -> std::sync::Arc { - Arc::new(self.finish()) - } - - fn coord_type(&self) -> CoordType { - self.builder.coord_type() - } - - fn metadata(&self) -> Arc { - self.builder.metadata() - } -} diff --git a/rust/geoarrow/src/io/geozero/array/mod.rs b/rust/geoarrow/src/io/geozero/array/mod.rs index 859aa86d..01a5d525 100644 --- a/rust/geoarrow/src/io/geozero/array/mod.rs +++ b/rust/geoarrow/src/io/geozero/array/mod.rs @@ -1,15 +1,15 @@ mod dynamic; +mod geometry; mod geometrycollection; mod linestring; -mod mixed; mod multilinestring; mod multipoint; mod multipolygon; mod point; mod polygon; +pub use geometry::{GeometryStreamBuilder, ToGeometryArray}; pub use linestring::ToLineStringArray; -pub use mixed::{MixedGeometryStreamBuilder, ToMixedArray}; pub use multilinestring::ToMultiLineStringArray; pub use multipoint::ToMultiPointArray; pub use multipolygon::ToMultiPolygonArray; diff --git a/rust/geoarrow/src/io/geozero/mod.rs b/rust/geoarrow/src/io/geozero/mod.rs index c573066e..6d6d5f92 100644 --- a/rust/geoarrow/src/io/geozero/mod.rs +++ b/rust/geoarrow/src/io/geozero/mod.rs @@ -6,7 +6,7 @@ mod scalar; pub(crate) mod table; pub use array::{ - ToLineStringArray, ToMixedArray, ToMultiLineStringArray, ToMultiPointArray, + ToGeometryArray, ToLineStringArray, ToMultiLineStringArray, ToMultiPointArray, ToMultiPolygonArray, ToPointArray, ToPolygonArray, }; pub use scalar::ToGeometry; diff --git a/rust/geoarrow/src/io/geozero/scalar/geometry.rs b/rust/geoarrow/src/io/geozero/scalar/geometry.rs index a34f89df..12a6c785 100644 --- a/rust/geoarrow/src/io/geozero/scalar/geometry.rs +++ b/rust/geoarrow/src/io/geozero/scalar/geometry.rs @@ -1,4 +1,3 @@ -use crate::datatypes::Dimension; use crate::io::geozero::scalar::geometry_collection::process_geometry_collection; use crate::io::geozero::scalar::linestring::process_line_string; use crate::io::geozero::scalar::multilinestring::process_multi_line_string; @@ -6,7 +5,7 @@ use crate::io::geozero::scalar::multipoint::process_multi_point; use crate::io::geozero::scalar::multipolygon::process_multi_polygon; use crate::io::geozero::scalar::point::process_point; use crate::io::geozero::scalar::polygon::process_polygon; -use crate::io::geozero::ToMixedArray; +use crate::io::geozero::ToGeometryArray; use crate::scalar::{Geometry, OwnedGeometry}; use crate::trait_::ArrayAccessor; use crate::ArrayBase; @@ -48,12 +47,12 @@ impl GeozeroGeometry for Geometry<'_> { /// Convert a geozero scalar data source to an [OwnedGeometry]. pub trait ToGeometry { /// Convert a geozero scalar data source to an [OwnedGeometry]. - fn to_geometry(&self, dim: Dimension) -> geozero::error::Result; + fn to_geometry(&self) -> geozero::error::Result; } impl ToGeometry for T { - fn to_geometry(&self, dim: Dimension) -> geozero::error::Result { - let arr = self.to_mixed_geometry_array(dim)?; + fn to_geometry(&self) -> geozero::error::Result { + let arr = self.to_geometry_array()?; assert_eq!(arr.len(), 1); Ok(OwnedGeometry::from(arr.value(0))) } diff --git a/rust/geoarrow/src/io/postgis/reader.rs b/rust/geoarrow/src/io/postgis/reader.rs index d04634e2..01a19329 100644 --- a/rust/geoarrow/src/io/postgis/reader.rs +++ b/rust/geoarrow/src/io/postgis/reader.rs @@ -12,7 +12,7 @@ use std::sync::Arc; use crate::datatypes::Dimension; use crate::error::{GeoArrowError, Result}; -use crate::io::geozero::array::MixedGeometryStreamBuilder; +use crate::io::geozero::array::GeometryStreamBuilder; use crate::io::geozero::table::{GeoTableBuilder, GeoTableBuilderOptions}; use crate::table::Table; use crate::trait_::GeometryArrayBuilder; @@ -174,7 +174,7 @@ pub async fn read_postgis<'c, E: Executor<'c, Database = Postgres>>( ) -> Result> { let query = sqlx::query::(sql); let mut result_stream = query.fetch(executor); - let mut table_builder: Option> = None; + let mut table_builder: Option> = None; // TODO: try out chunking with `result_stream.try_chunks` let mut row_idx = 0;