20 #include <cudf/io/detail/parquet.hpp>
57 std::optional<std::vector<std::string>> _columns;
60 std::vector<std::vector<size_type>> _row_groups;
62 int64_t _skip_rows = 0;
64 std::optional<size_type> _num_rows;
67 std::optional<std::reference_wrapper<ast::expression const>> _filter;
70 bool _convert_strings_to_categories =
false;
72 bool _use_pandas_metadata =
true;
76 std::optional<std::vector<reader_column_schema>> _reader_column_schema;
118 return _convert_strings_to_categories;
135 return _reader_column_schema;
/**
 * @brief Returns the number of rows to read.
 *
 * @return Const reference to the optional row count held in `_num_rows`
 */
151 [[nodiscard]] std::optional<size_type>
const&
get_num_rows()
const {
return _num_rows; }
/**
 * @brief Returns the names of the columns to be read, if set.
 *
 * @return Const reference to the `_columns` member
 */
158 [[nodiscard]]
auto const&
get_columns()
const {
return _columns; }
/**
 * @brief Returns the AST based filter for predicate pushdown.
 *
 * @return Const reference to the `_filter` member
 */
172 [[nodiscard]]
auto const&
get_filter()
const {
return _filter; }
/**
 * @brief Sets the names of the columns to be read.
 *
 * @param col_names Column names; taken by value and moved into `_columns`
 */
186 void set_columns(std::vector<std::string> col_names) { _columns = std::move(col_names); }
224 _reader_column_schema = std::move(val);
278 options._columns = std::move(col_names);
314 options._convert_strings_to_categories = val;
326 options._use_pandas_metadata = val;
338 options._reader_column_schema = std::move(val);
374 options._timestamp_type = type;
447 std::size_t chunk_read_limit,
472 std::size_t chunk_read_limit,
473 std::size_t pass_read_limit,
508 std::unique_ptr<cudf::io::parquet::detail::chunked_reader> reader;
533 std::vector<partition_info> _partitions;
535 std::optional<table_input_metadata> _metadata;
537 std::vector<std::map<std::string, std::string>> _user_data;
540 bool _write_timestamps_as_int96 =
false;
543 bool _write_timestamps_as_UTC =
true;
545 std::vector<std::string> _column_chunks_file_paths;
561 std::optional<size_type> _max_page_fragment_size;
563 std::shared_ptr<writer_compression_statistics> _compression_stats;
565 bool _v2_page_headers =
false;
574 : _sink(sink), _table(
table)
/**
 * @brief Returns partitions.
 *
 * @return Const reference to the `_partitions` member
 */
638 [[nodiscard]] std::vector<partition_info>
const&
get_partitions()
const {
return _partitions; }
678 return _column_chunks_file_paths;
754 return _compression_stats;
886 _compression_stats = std::move(comp_stats);
918 : options(sink,
table)
939 options._metadata = std::move(
metadata);
950 std::vector<std::map<std::string, std::string>>
metadata);
960 options._stats_level = sf;
1108 std::shared_ptr<writer_compression_statistics>
const& comp_stats)
1110 options._compression_stats = comp_stats;
1122 options._write_timestamps_as_int96 = enabled;
1134 options._write_timestamps_as_UTC = enabled;
1190 std::vector<std::unique_ptr<std::vector<uint8_t>>>
const& metadata_list);
1205 std::optional<table_input_metadata> _metadata;
1207 std::vector<std::map<std::string, std::string>> _user_data;
1210 bool _write_timestamps_as_int96 =
false;
1212 bool _write_timestamps_as_UTC =
true;
1228 std::optional<size_type> _max_page_fragment_size;
1230 std::shared_ptr<writer_compression_statistics> _compression_stats;
1232 bool _v2_page_headers =
false;
1377 return _compression_stats;
1494 _compression_stats = std::move(comp_stats);
1543 options._metadata = std::move(
metadata);
1554 std::vector<std::map<std::string, std::string>>
metadata);
1564 options._stats_level = sf;
1591 options._write_timestamps_as_int96 = enabled;
1603 options._write_timestamps_as_UTC = enabled;
1737 std::shared_ptr<writer_compression_statistics>
const& comp_stats)
1739 options._compression_stats = comp_stats;
1806 std::vector<partition_info>
const& partitions = {});
1816 std::unique_ptr<std::vector<uint8_t>>
close(
1817 std::vector<std::string>
const& column_chunks_file_paths = {});
1820 std::unique_ptr<parquet::detail::writer>
writer;
Indicator for the logical data type of an element in a column.
The chunked parquet reader class to read a Parquet file iteratively into a series of tables,...
table_with_metadata read_chunk() const
Read a chunk of rows in the given Parquet file.
bool has_next() const
Checks whether any data in the given file has not yet been read.
chunked_parquet_reader()=default
Default constructor, this should never be used.
chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Constructor for chunked reader.
chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Constructor for chunked reader.
~chunked_parquet_reader()
Destructor, destroying the internal reader instance.
Builds options for chunked_parquet_writer_options.
chunked_parquet_writer_options_builder & max_dictionary_size(size_t val)
Sets the maximum dictionary size, in bytes.
chunked_parquet_writer_options_builder & stats_level(statistics_freq sf)
Sets the level of statistics in chunked_parquet_writer_options.
chunked_parquet_writer_options_builder & max_page_fragment_size(size_type val)
Sets the maximum page fragment size, in rows.
chunked_parquet_writer_options && build()
Moves the chunked_parquet_writer_options member once it is built.
chunked_parquet_writer_options_builder & max_page_size_rows(size_type val)
Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting....
chunked_parquet_writer_options_builder & int96_timestamps(bool enabled)
Set to true if timestamps should be written as int96 types instead of int64 types....
chunked_parquet_writer_options_builder()=default
Default constructor.
chunked_parquet_writer_options_builder & row_group_size_bytes(size_t val)
Sets the maximum row group size, in bytes.
chunked_parquet_writer_options_builder & row_group_size_rows(size_type val)
Sets the maximum number of rows in output row groups.
chunked_parquet_writer_options_builder & dictionary_policy(enum dictionary_policy val)
Sets the policy for dictionary use.
chunked_parquet_writer_options_builder & utc_timestamps(bool enabled)
Set to true if timestamps are to be written as UTC.
chunked_parquet_writer_options_builder & key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets Key-Value footer metadata in chunked_parquet_writer_options.
chunked_parquet_writer_options_builder & metadata(table_input_metadata metadata)
Sets metadata to chunked_parquet_writer_options.
chunked_parquet_writer_options_builder & compression(compression_type compression)
Sets compression type to chunked_parquet_writer_options.
chunked_parquet_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
chunked_parquet_writer_options_builder & column_index_truncate_length(int32_t val)
Sets the desired maximum size in bytes for min and max values in the column index.
chunked_parquet_writer_options_builder & write_v2_headers(bool enabled)
Set to true if V2 page headers are to be written.
chunked_parquet_writer_options_builder & max_page_size_bytes(size_t val)
Sets the maximum uncompressed page size, in bytes.
chunked_parquet_writer_options_builder(sink_info const &sink)
Constructor from sink.
Settings for write_parquet_chunked().
std::vector< std::map< std::string, std::string > > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
auto const & get_metadata() const
Returns metadata information.
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
auto get_column_index_truncate_length() const
Returns maximum length of min or max values in column index, in bytes.
auto get_max_page_fragment_size() const
Returns maximum page fragment size, in rows.
void set_metadata(table_input_metadata metadata)
Sets metadata.
void enable_int96_timestamps(bool req)
Sets timestamp writing preferences.
static chunked_parquet_writer_options_builder builder(sink_info const &sink)
creates builder to build chunked_parquet_writer_options.
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
void set_key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets Key-Value footer metadata.
auto get_row_group_size_rows() const
Returns maximum row group size, in rows.
void set_stats_level(statistics_freq sf)
Sets the level of statistics.
void set_max_page_size_rows(size_type size_rows)
Sets the maximum page size, in rows.
auto get_row_group_size_bytes() const
Returns maximum row group size, in bytes.
void set_row_group_size_rows(size_type size_rows)
Sets the maximum row group size, in rows.
void enable_utc_timestamps(bool val)
Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to true.
auto get_max_dictionary_size() const
Returns maximum dictionary size, in bytes.
void set_max_page_fragment_size(size_type size_rows)
Sets the maximum page fragment size, in rows.
auto get_max_page_size_rows() const
Returns maximum page size, in rows.
dictionary_policy get_dictionary_policy() const
Returns policy for dictionary use.
void set_max_page_size_bytes(size_t size_bytes)
Sets the maximum uncompressed page size, in bytes.
statistics_freq get_stats_level() const
Returns level of statistics requested in output file.
auto get_max_page_size_bytes() const
Returns maximum uncompressed page size, in bytes.
auto is_enabled_write_v2_headers() const
Returns true if V2 page headers should be written.
bool is_enabled_int96_timestamps() const
Returns true if timestamps will be written as INT96.
chunked_parquet_writer_options()=default
Default constructor.
compression_type get_compression() const
Returns compression format used.
void set_column_index_truncate_length(int32_t size_bytes)
Sets the maximum length of min or max values in column index, in bytes.
void set_row_group_size_bytes(size_t size_bytes)
Sets the maximum row group size, in bytes.
void set_dictionary_policy(dictionary_policy policy)
Sets the policy for dictionary use.
void enable_write_v2_headers(bool val)
Sets preference for V2 page headers. Write V2 page headers if set to true.
void set_compression(compression_type compression)
Sets compression type.
void set_max_dictionary_size(size_t size_bytes)
Sets the maximum dictionary size, in bytes.
sink_info const & get_sink() const
Returns sink info.
auto is_enabled_utc_timestamps() const
Returns true if timestamps will be written as UTC.
chunked parquet writer class to handle options and write tables in chunks.
parquet_chunked_writer()=default
Default constructor, this should never be used. This is added just to satisfy cython.
std::unique_ptr< std::vector< uint8_t > > close(std::vector< std::string > const &column_chunks_file_paths={})
Finishes the chunked/streamed write process.
std::unique_ptr< parquet::detail::writer > writer
Unique pointer to impl writer class.
parquet_chunked_writer & write(table_view const &table, std::vector< partition_info > const &partitions={})
Writes table to output.
parquet_chunked_writer(chunked_parquet_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructor with chunked writer options.
Builds parquet_reader_options to use for read_parquet().
parquet_reader_options_builder(source_info src)
Constructor from source info.
parquet_reader_options_builder & skip_rows(int64_t val)
Sets number of rows to skip.
parquet_reader_options_builder & columns(std::vector< std::string > col_names)
Sets names of the columns to be read.
parquet_reader_options_builder & timestamp_type(data_type type)
Sets timestamp_type used to cast timestamp columns.
parquet_reader_options_builder & use_pandas_metadata(bool val)
Sets to enable/disable use of pandas metadata to read.
parquet_reader_options_builder()=default
Default constructor.
parquet_reader_options_builder & row_groups(std::vector< std::vector< size_type >> row_groups)
Sets vector of individual row groups to read.
parquet_reader_options_builder & set_column_schema(std::vector< reader_column_schema > val)
Sets reader column schema.
parquet_reader_options && build()
move parquet_reader_options member once it's built.
parquet_reader_options_builder & filter(ast::expression const &filter)
Sets AST based filter for predicate pushdown.
parquet_reader_options_builder & num_rows(size_type val)
Sets number of rows to read.
parquet_reader_options_builder & convert_strings_to_categories(bool val)
Sets enable/disable conversion of strings to categories.
Settings for read_parquet().
data_type get_timestamp_type() const
Returns timestamp type used to cast timestamp columns.
parquet_reader_options()=default
Default constructor.
static parquet_reader_options_builder builder(source_info src)
Creates a parquet_reader_options_builder which will build parquet_reader_options.
void set_skip_rows(int64_t val)
Sets number of rows to skip.
void set_columns(std::vector< std::string > col_names)
Sets names of the columns to be read.
void enable_convert_strings_to_categories(bool val)
Sets to enable/disable conversion of strings to categories.
std::optional< std::vector< reader_column_schema > > get_column_schema() const
Returns optional tree of metadata.
source_info const & get_source() const
Returns source info.
auto const & get_row_groups() const
Returns list of individual row groups to be read.
std::optional< size_type > const & get_num_rows() const
Returns number of rows to read.
void set_row_groups(std::vector< std::vector< size_type >> row_groups)
Sets vector of individual row groups to read.
void set_num_rows(size_type val)
Sets number of rows to read.
auto const & get_columns() const
Returns names of the columns to be read, if set.
void set_timestamp_type(data_type type)
Sets timestamp_type used to cast timestamp columns.
bool is_enabled_convert_strings_to_categories() const
Returns true/false depending on whether strings should be converted to categories or not.
void enable_use_pandas_metadata(bool val)
Sets to enable/disable use of pandas metadata to read.
bool is_enabled_use_pandas_metadata() const
Returns true/false depending on whether to use pandas metadata or not while reading.
void set_column_schema(std::vector< reader_column_schema > val)
Sets reader column schema.
void set_filter(ast::expression const &filter)
Sets AST based filter for predicate pushdown.
auto const & get_filter() const
Returns AST based filter for predicate pushdown.
int64_t get_skip_rows() const
Returns number of rows to skip from the start.
Class to build parquet_writer_options.
parquet_writer_options_builder(sink_info const &sink, table_view const &table)
Constructor from sink and table.
parquet_writer_options_builder & metadata(table_input_metadata metadata)
Sets metadata in parquet_writer_options.
parquet_writer_options_builder & dictionary_policy(enum dictionary_policy val)
Sets the policy for dictionary use.
parquet_writer_options_builder & max_page_size_bytes(size_t val)
Sets the maximum uncompressed page size, in bytes.
parquet_writer_options_builder & stats_level(statistics_freq sf)
Sets the level of statistics in parquet_writer_options.
parquet_writer_options_builder & row_group_size_rows(size_type val)
Sets the maximum number of rows in output row groups.
parquet_writer_options_builder & key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets Key-Value footer metadata in parquet_writer_options.
parquet_writer_options_builder()=default
Default constructor.
parquet_writer_options && build()
move parquet_writer_options member once it's built.
parquet_writer_options_builder & row_group_size_bytes(size_t val)
Sets the maximum row group size, in bytes.
parquet_writer_options_builder & max_page_size_rows(size_type val)
Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting....
parquet_writer_options_builder & utc_timestamps(bool enabled)
Set to true if timestamps are to be written as UTC.
parquet_writer_options_builder & max_page_fragment_size(size_type val)
Sets the maximum page fragment size, in rows.
parquet_writer_options_builder & compression(compression_type compression)
Sets compression type in parquet_writer_options.
parquet_writer_options_builder & write_v2_headers(bool enabled)
Set to true if V2 page headers are to be written.
parquet_writer_options_builder & partitions(std::vector< partition_info > partitions)
Sets partitions in parquet_writer_options.
parquet_writer_options_builder & max_dictionary_size(size_t val)
Sets the maximum dictionary size, in bytes.
parquet_writer_options_builder & compression_statistics(std::shared_ptr< writer_compression_statistics > const &comp_stats)
Sets the pointer to the output compression statistics.
parquet_writer_options_builder & int96_timestamps(bool enabled)
Sets whether int96 timestamps are written or not in parquet_writer_options.
parquet_writer_options_builder & column_chunks_file_paths(std::vector< std::string > file_paths)
Sets column chunks file path to be set in the raw output metadata.
parquet_writer_options_builder & column_index_truncate_length(int32_t val)
Sets the desired maximum size in bytes for min and max values in the column index.
Settings for write_parquet().
void enable_write_v2_headers(bool val)
Sets preference for V2 page headers. Write V2 page headers if set to true.
void set_partitions(std::vector< partition_info > partitions)
Sets partitions.
statistics_freq get_stats_level() const
Returns level of statistics requested in output file.
static parquet_writer_options_builder builder(sink_info const &sink, table_view const &table)
Create builder to create parquet_writer_options.
void set_dictionary_policy(dictionary_policy policy)
Sets the policy for dictionary use.
parquet_writer_options()=default
Default constructor.
auto const & get_metadata() const
Returns associated metadata.
std::vector< std::map< std::string, std::string > > const & get_key_value_metadata() const
Returns Key-Value footer metadata information.
void set_max_dictionary_size(size_t size_bytes)
Sets the maximum dictionary size, in bytes.
auto get_row_group_size_bytes() const
Returns maximum row group size, in bytes.
void set_max_page_fragment_size(size_type size_rows)
Sets the maximum page fragment size, in rows.
compression_type get_compression() const
Returns compression format used.
auto get_max_dictionary_size() const
Returns maximum dictionary size, in bytes.
void set_compression_statistics(std::shared_ptr< writer_compression_statistics > comp_stats)
Sets the pointer to the output compression statistics.
auto get_max_page_size_bytes() const
Returns the maximum uncompressed page size, in bytes.
bool is_enabled_int96_timestamps() const
Returns true if timestamps will be written as INT96.
sink_info const & get_sink() const
Returns sink info.
void set_compression(compression_type compression)
Sets compression type.
std::vector< std::string > const & get_column_chunks_file_paths() const
Returns Column chunks file paths to be set in the raw output metadata.
auto get_column_index_truncate_length() const
Returns maximum length of min or max values in column index, in bytes.
void set_max_page_size_rows(size_type size_rows)
Sets the maximum page size, in rows.
auto is_enabled_utc_timestamps() const
Returns true if timestamps will be written as UTC.
auto get_max_page_fragment_size() const
Returns maximum page fragment size, in rows.
void set_row_group_size_bytes(size_t size_bytes)
Sets the maximum row group size, in bytes.
void enable_utc_timestamps(bool val)
Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to true.
void set_max_page_size_bytes(size_t size_bytes)
Sets the maximum uncompressed page size, in bytes.
std::shared_ptr< writer_compression_statistics > get_compression_statistics() const
Returns a shared pointer to the user-provided compression statistics.
void set_stats_level(statistics_freq sf)
Sets the level of statistics.
dictionary_policy get_dictionary_policy() const
Returns policy for dictionary use.
auto get_row_group_size_rows() const
Returns maximum row group size, in rows.
table_view get_table() const
Returns table_view.
void set_column_chunks_file_paths(std::vector< std::string > file_paths)
Sets column chunks file path to be set in the raw output metadata.
void enable_int96_timestamps(bool req)
Sets timestamp writing preferences. INT96 timestamps will be written if true and TIMESTAMP_MICROS wil...
auto is_enabled_write_v2_headers() const
Returns true if V2 page headers should be written.
void set_row_group_size_rows(size_type size_rows)
Sets the maximum row group size, in rows.
void set_key_value_metadata(std::vector< std::map< std::string, std::string >> metadata)
Sets Key-Value footer metadata.
auto get_max_page_size_rows() const
Returns maximum page size, in rows.
void set_metadata(table_input_metadata metadata)
Sets metadata.
void set_column_index_truncate_length(int32_t size_bytes)
Sets the maximum length of min or max values in column index, in bytes.
static parquet_writer_options_builder builder()
Create builder to create parquet_writer_options.
std::vector< partition_info > const & get_partitions() const
Returns partitions.
A set of cudf::column_view's of the same size.
A set of cudf::column's of the same size.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
constexpr size_type default_row_group_size_rows
1 million rows per row group
constexpr int32_t default_column_index_truncate_length
truncate to 64 bytes
constexpr size_t default_row_group_size_bytes
128MB per row group
constexpr size_type default_max_page_fragment_size
5000 rows per page fragment
table_with_metadata read_parquet(parquet_reader_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Reads a Parquet dataset into a set of columns.
constexpr size_t default_max_dictionary_size
1MB dictionary size
constexpr size_t default_max_page_size_bytes
512KB per page
constexpr size_type default_max_page_size_rows
20k rows per page
compression_type
Compression algorithms.
statistics_freq
Column statistics granularity type for parquet/orc writers.
dictionary_policy
Control use of dictionary encoding for parquet writer.
@ SNAPPY
Snappy format, using byte-oriented LZ77.
@ AUTO
Automatically detect or select compression format.
@ STATISTICS_ROWGROUP
Per-Rowgroup column statistics.
@ ALWAYS
Use dictionary regardless of impact on compression.
std::unique_ptr< std::vector< uint8_t > > merge_row_group_metadata(std::vector< std::unique_ptr< std::vector< uint8_t >>> const &metadata_list)
Merges multiple raw metadata blobs that were previously created by write_parquet into a single metada...
std::unique_ptr< std::vector< uint8_t > > write_parquet(parquet_writer_options const &options, rmm::cuda_stream_view stream=cudf::get_default_stream())
Writes a set of columns to parquet format.
device_memory_resource * get_current_device_resource()
int32_t size_type
Row index type for columns and tables.
@ EMPTY
Always null with no underlying data.
cuDF-IO API type definitions
A generic expression that can be evaluated to return a value.
Destination information for write interfaces.
Source information for read interfaces.
Class definitions for (mutable)_table_view
Type declarations for libcudf.