diff --git a/Cargo.toml b/Cargo.toml
index 10df9483..3954e31f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -155,7 +155,11 @@ features = [
   "string_pad",
   "replace",
   "cov",
-  "http"
+  "http",
+  "cloud",
+  "aws",
+  "gcp",
+  "azure"
 ]
 git = "https://github.com/pola-rs/polars.git"
 rev = "7686025ac7738607f2d4f6887e9a1313b7c8b1e2"
diff --git a/polars/io.ts b/polars/io.ts
index f25af201..7337a8d2 100644
--- a/polars/io.ts
+++ b/polars/io.ts
@@ -5,6 +5,7 @@ import { isPath } from "./utils";
 import { type LazyDataFrame, _LazyDataFrame } from "./lazy/dataframe";
 import { type Readable, Stream } from "stream";
 import { concat } from "./functions";
+import type { ScanParquetOptions, RowCount } from "./types";
 
 export interface ReadCsvOptions {
   inferSchemaLength: number | null;
@@ -31,7 +32,7 @@ export interface ReadCsvOptions {
   skipRows: number;
   tryParseDates: boolean;
   skipRowsAfterHeader: number;
-  rowCount: any;
+  rowCount: RowCount;
   raiseIfEmpty: boolean;
   truncateRaggedLines: boolean;
   missingIsNull: boolean;
@@ -470,23 +471,6 @@ export function readAvro(pathOrBody, options = {}) {
   throw new Error("must supply either a path or body");
 }
 
-interface RowCount {
-  name: string;
-  offset: string;
-}
-
-interface ScanParquetOptions {
-  nRows?: number;
-  cache?: boolean;
-  parallel?: "auto" | "columns" | "row_groups" | "none";
-  rowCount?: RowCount;
-  rechunk?: boolean;
-  lowMemory?: boolean;
-  useStatistics?: boolean;
-  cloudOptions?: Map<string, string>;
-  retries?: number;
-}
-
 /**
  * Lazily read from a local or cloud-hosted parquet file (or files).
 *
@@ -503,6 +487,10 @@ interface ScanParquetOptions {
     This determines the direction of parallelism. 'auto' will try to determine the optimal direction.
   @param options.useStatistics - Use statistics in the parquet to determine if pages can be skipped from reading.
   @param options.hivePartitioning - Infer statistics and schema from hive partitioned URL and use them to prune reads.
+  @param options.glob - Expand path given via globbing rules.
+  @param options.hiveSchema - The column names and data types of the columns by which the data is partitioned.
+    If set to `None` (default), the schema of the Hive partitions is inferred.
+  @param options.tryParseHiveDates - Whether to try parsing hive values as date/datetime types.
   @param options.rechunk - In case of reading multiple files via a glob pattern rechunk the final DataFrame into contiguous memory chunks.
   @param options.lowMemory - Reduce memory pressure at the expense of performance.
   @param options.cache - Cache the result after reading.
@@ -518,6 +506,7 @@ interface ScanParquetOptions {
     If `storage_options` is not provided, Polars will try to infer the information from environment variables.
   @param retries - Number of retries if accessing a cloud instance fails.
+  @param includeFilePaths - Include the path of the source file(s) as a column with this name.
  */
 export function scanParquet(source: string, options: ScanParquetOptions = {}) {
   const defaultOptions = { parallel: "auto" };
diff --git a/polars/types.ts b/polars/types.ts
index c83fd31d..444c67e7 100644
--- a/polars/types.ts
+++ b/polars/types.ts
@@ -141,12 +141,21 @@ export interface ReadParquetOptions {
 /**
  * Options for {@link scanParquet}
  */
 export interface ScanParquetOptions {
-  columns?: string[] | number[];
-  numRows?: number;
-  parallel?: "auto" | "columns" | "row_groups" | "none";
-  rowCount?: RowCount;
+  nRows?: number;
+  rowIndexName?: string;
+  rowIndexOffset?: number;
   cache?: boolean;
+  parallel?: "auto" | "columns" | "row_groups" | "none";
+  glob?: boolean;
+  hivePartitioning?: boolean;
+  hiveSchema?: unknown;
+  tryParseHiveDates?: boolean;
   rechunk?: boolean;
+  lowMemory?: boolean;
+  useStatistics?: boolean;
+  cloudOptions?: unknown;
+  retries?: number;
+  includeFilePaths?: string;
 }
@@ -156,7 +165,7 @@ export interface RowCount {
   /** name of column */
   name: string;
   /** offset */
-  offset: string;
+  offset: number;
 }
 
 /**
diff --git a/src/lazy/dataframe.rs b/src/lazy/dataframe.rs
index 05110821..3ecbbf4b 100644
--- a/src/lazy/dataframe.rs
+++ b/src/lazy/dataframe.rs
@@ -713,22 +713,38 @@ pub fn scan_csv(path: String, options: ScanCsvOptions) -> napi::Result<JsLazyFrame> {
 #[napi(object)]
 pub struct ScanParquetOptions {
     pub n_rows: Option<i64>,
+    pub row_index_name: Option<String>,
+    pub row_index_offset: Option<u32>,
     pub cache: Option<bool>,
     pub parallel: Wrap<ParallelStrategy>,
-    pub row_count: Option<JsRowCount>,
+    pub glob: Option<bool>,
+    pub hive_partitioning: Option<bool>,
+    pub hive_schema: Option<Wrap<Schema>>,
+    pub try_parse_hive_dates: Option<bool>,
     pub rechunk: Option<bool>,
     pub low_memory: Option<bool>,
     pub use_statistics: Option<bool>,
     pub cloud_options: Option<HashMap<String, String>>,
     pub retries: Option<i64>,
+    pub include_file_paths: Option<String>,
 }
 
 #[napi(catch_unwind)]
 pub fn scan_parquet(path: String, options: ScanParquetOptions) -> napi::Result<JsLazyFrame> {
     let n_rows = options.n_rows.map(|i| i as usize);
     let cache = options.cache.unwrap_or(true);
+    let glob = options.glob.unwrap_or(true);
     let parallel = options.parallel;
-    let row_index: Option<RowIndex> = options.row_count.map(|rc| rc.into());
+
+    let row_index: Option<RowIndex> = if let Some(idn) = options.row_index_name {
+        Some(RowIndex {
+            name: idn.into(),
+            offset: options.row_index_offset.unwrap_or(0)
+        })
+    } else {
+        None
+    };
+
     let rechunk = options.rechunk.unwrap_or(false);
     let low_memory = options.low_memory.unwrap_or(false);
     let use_statistics = options.use_statistics.unwrap_or(false);
@@ -751,6 +767,16 @@ pub fn scan_parquet(path: String, options: ScanParquetOptions) -> napi::Result<JsLazyFrame> {
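For context, here is a minimal usage sketch of the options this diff exposes, as seen from the TypeScript side. It is not part of the patch: the bucket path, `aws_region` key, and column names are placeholders, and `cloudOptions` is shown as a plain key/value object on the assumption that it is forwarded to Polars' cloud reader the same way `storage_options` is in Python Polars.

```ts
import pl from "nodejs-polars";

// Hypothetical S3 glob and storage options -- placeholders, not values taken from this PR.
const lf = pl.scanParquet("s3://my-bucket/data/*.parquet", {
  cloudOptions: { aws_region: "us-east-1" }, // may also be inferred from environment variables
  retries: 2,                                // retry failed cloud requests
  rowIndexName: "row_nr",                    // add a row-index column named "row_nr"...
  rowIndexOffset: 0,                         // ...starting at 0
  includeFilePaths: "source_file",           // add a column holding the originating file path
  hivePartitioning: true,                    // prune reads using hive-partitioned paths
});

// Nothing is read until the lazy query is collected.
const df = lf.collectSync();
```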