# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# distutils: language = c++

from libcpp.unordered_map cimport unordered_map
from libcpp cimport bool as c_bool

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *


# ---------------------------------------------------------------------------
# arrow/api.h  (C++ namespace "arrow")
# ---------------------------------------------------------------------------
cdef extern from "arrow/api.h" namespace "arrow" nogil:

    # Iterator specialisation over shared_ptr[CRecordBatch]; it adds no
    # members of its own here.
    cdef cppclass CRecordBatchIterator "arrow::RecordBatchIterator"(
            CIterator[shared_ptr[CRecordBatch]]):
        pass


# ---------------------------------------------------------------------------
# arrow/dataset/plan.h  (C++ namespace "arrow::dataset::internal")
# ---------------------------------------------------------------------------
cdef extern from "arrow/dataset/plan.h" namespace "arrow::dataset::internal" nogil:

    cdef void Initialize()


# Function-type aliases for file-writer callbacks.
# cb_writer_finish_internal is the signature wrapped by the
# writer_pre_finish / writer_post_finish members of
# CFileSystemDatasetWriteOptions in this file.  cb_writer_finish is not
# referenced anywhere else in this file.
# NOTE(review): presumably cb_writer_finish is consumed by a module that
# cimports this one -- confirm.
ctypedef CStatus cb_writer_finish_internal(CFileWriter*)
ctypedef void cb_writer_finish(dict, CFileWriter*)


# ---------------------------------------------------------------------------
# arrow/dataset/api.h  (C++ namespace "arrow::dataset")
# ---------------------------------------------------------------------------
cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:

    # -- Write behaviour enum ------------------------------------------------
    # NOTE: each member's C name is a string literal whose backslash
    # continuation sits *inside* the quotes, so the text on the following
    # line is part of the literal.  The layout is intentionally left alone.
    cdef enum ExistingDataBehavior" arrow::dataset::ExistingDataBehavior":
        ExistingDataBehavior_DELETE_MATCHING" \
            arrow::dataset::ExistingDataBehavior::kDeleteMatchingPartitions"
        ExistingDataBehavior_OVERWRITE_OR_IGNORE" \
            arrow::dataset::ExistingDataBehavior::kOverwriteOrIgnore"
        ExistingDataBehavior_ERROR" \
            arrow::dataset::ExistingDataBehavior::kError"

    # -- Scan options --------------------------------------------------------
    cdef cppclass CScanOptions "arrow::dataset::ScanOptions":
        shared_ptr[CSchema] dataset_schema
        shared_ptr[CSchema] projected_schema
        c_bool use_threads

    cdef cppclass CScanNodeOptions \
            "arrow::dataset::ScanNodeOptions"(CExecNodeOptions):
        CScanNodeOptions(shared_ptr[CDataset] dataset,
                         shared_ptr[CScanOptions] scan_options)

        shared_ptr[CScanOptions] scan_options

    cdef cppclass CFragmentScanOptions "arrow::dataset::FragmentScanOptions":
        c_string type_name() const

    # -- Scan tasks and fragments --------------------------------------------
    ctypedef CIterator[shared_ptr[CScanTask]] CScanTaskIterator \
        "arrow::dataset::ScanTaskIterator"

    cdef cppclass CScanTask" arrow::dataset::ScanTask":
        CResult[CRecordBatchIterator] Execute()

    cdef cppclass CFragment "arrow::dataset::Fragment":
        CResult[shared_ptr[CSchema]] ReadPhysicalSchema()
        CResult[CScanTaskIterator] Scan(shared_ptr[CScanOptions] options)
        c_bool splittable() const
        c_string type_name() const
        const CExpression& partition_expression() const

    ctypedef vector[shared_ptr[CFragment]] CFragmentVector \
        "arrow::dataset::FragmentVector"

    ctypedef CIterator[shared_ptr[CFragment]] CFragmentIterator \
        "arrow::dataset::FragmentIterator"

    cdef cppclass CInMemoryFragment \
            "arrow::dataset::InMemoryFragment"(CFragment):
        CInMemoryFragment(
            vector[shared_ptr[CRecordBatch]] record_batches,
            CExpression partition_expression)

    # A record batch paired with the fragment member declared next to it.
    cdef cppclass CTaggedRecordBatch "arrow::dataset::TaggedRecordBatch":
        shared_ptr[CRecordBatch] record_batch
        shared_ptr[CFragment] fragment

    ctypedef CIterator[CTaggedRecordBatch] CTaggedRecordBatchIterator \
        "arrow::dataset::TaggedRecordBatchIterator"

    # -- Scanner and its builder ---------------------------------------------
    cdef cppclass CScanner "arrow::dataset::Scanner":
        # Two constructor overloads: from a dataset or from a fragment.
        CScanner(shared_ptr[CDataset], shared_ptr[CScanOptions])
        CScanner(shared_ptr[CFragment], shared_ptr[CScanOptions])
        CResult[CScanTaskIterator] Scan()
        CResult[CTaggedRecordBatchIterator] ScanBatches()
        CResult[shared_ptr[CTable]] ToTable()
        CResult[shared_ptr[CTable]] TakeRows(const CArray& indices)
        CResult[shared_ptr[CTable]] Head(int64_t num_rows)
        CResult[int64_t] CountRows()
        CResult[CFragmentIterator] GetFragments()
        CResult[shared_ptr[CRecordBatchReader]] ToRecordBatchReader()
        const shared_ptr[CScanOptions]& options()

    cdef cppclass CScannerBuilder "arrow::dataset::ScannerBuilder":
        CScannerBuilder(
            shared_ptr[CDataset],
            shared_ptr[CScanOptions] scan_options)
        CScannerBuilder(
            shared_ptr[CSchema],
            shared_ptr[CFragment],
            shared_ptr[CScanOptions] scan_options)

        @staticmethod
        shared_ptr[CScannerBuilder] FromRecordBatchReader(
            shared_ptr[CRecordBatchReader] reader)

        # ProjectColumns and Project both bind the C++ name "Project"; the
        # distinct Cython names separate the two overloads.
        CStatus ProjectColumns "Project"(const vector[c_string]& columns)
        CStatus Project(vector[CExpression]& exprs, vector[c_string]& columns)
        CStatus Filter(CExpression filter)
        CStatus UseThreads(c_bool use_threads)
        CStatus Pool(CMemoryPool* pool)
        CStatus BatchSize(int64_t batch_size)
        CStatus FragmentScanOptions(
            shared_ptr[CFragmentScanOptions] fragment_scan_options)
        CResult[shared_ptr[CScanner]] Finish()
        shared_ptr[CSchema] schema() const

    # -- Datasets ------------------------------------------------------------
    ctypedef vector[shared_ptr[CDataset]] CDatasetVector \
        "arrow::dataset::DatasetVector"

    cdef cppclass CDataset "arrow::dataset::Dataset":
        const shared_ptr[CSchema] & schema()
        CResult[CFragmentIterator] GetFragments()
        CResult[CFragmentIterator] GetFragments(CExpression predicate)
        const CExpression & partition_expression()
        c_string type_name()

        CResult[shared_ptr[CDataset]] ReplaceSchema(shared_ptr[CSchema])

        CResult[shared_ptr[CScannerBuilder]] NewScan()

    cdef cppclass CInMemoryDataset \
            "arrow::dataset::InMemoryDataset"(CDataset):
        CInMemoryDataset(shared_ptr[CRecordBatchReader])
        CInMemoryDataset(shared_ptr[CTable])

    cdef cppclass CUnionDataset "arrow::dataset::UnionDataset"(CDataset):
        @staticmethod
        CResult[shared_ptr[CUnionDataset]] Make(
            shared_ptr[CSchema] schema,
            CDatasetVector children)

        const CDatasetVector& children() const

    # -- Dataset discovery / factories ---------------------------------------
    cdef cppclass CInspectOptions "arrow::dataset::InspectOptions":
        int fragments

    cdef cppclass CFinishOptions "arrow::dataset::FinishOptions":
        shared_ptr[CSchema] schema
        CInspectOptions inspect_options
        c_bool validate_fragments

    cdef cppclass CDatasetFactory "arrow::dataset::DatasetFactory":
        CResult[vector[shared_ptr[CSchema]]] InspectSchemas(CInspectOptions)
        CResult[shared_ptr[CSchema]] Inspect(CInspectOptions)
        # FinishWithSchema and Finish both bind the C++ name "Finish".
        CResult[shared_ptr[CDataset]] FinishWithSchema "Finish"(
            const shared_ptr[CSchema]& schema)
        CResult[shared_ptr[CDataset]] Finish()
        const CExpression& root_partition()
        CStatus SetRootPartition(CExpression partition)

    cdef cppclass CUnionDatasetFactory "arrow::dataset::UnionDatasetFactory":
        @staticmethod
        CResult[shared_ptr[CDatasetFactory]] Make(
            vector[shared_ptr[CDatasetFactory]] factories)

    # -- File sources, formats and writers -----------------------------------
    cdef cppclass CFileSource "arrow::dataset::FileSource":
        const c_string& path() const
        const shared_ptr[CFileSystem]& filesystem() const
        const shared_ptr[CBuffer]& buffer() const
        # HACK: Cython can't handle all the overloads so don't declare them.
        # This means invalid construction of CFileSource won't be caught in
        # the C++ generation phase (though it will still be caught when
        # the generated C++ is compiled).
        CFileSource(...)

    cdef cppclass CFileWriteOptions "arrow::dataset::FileWriteOptions":
        const shared_ptr[CFileFormat]& format() const
        c_string type_name() const

    cdef cppclass CFileWriter "arrow::dataset::FileWriter":
        const shared_ptr[CFileFormat]& format() const
        const shared_ptr[CSchema]& schema() const
        const shared_ptr[CFileWriteOptions]& options() const
        const CFileLocator& destination() const

    cdef cppclass CFileFormat "arrow::dataset::FileFormat":
        shared_ptr[CFragmentScanOptions] default_fragment_scan_options
        c_string type_name() const
        CResult[shared_ptr[CSchema]] Inspect(const CFileSource&) const
        CResult[shared_ptr[CFileFragment]] MakeFragment(
            CFileSource source,
            CExpression partition_expression,
            shared_ptr[CSchema] physical_schema)
        shared_ptr[CFileWriteOptions] DefaultWriteOptions()

    cdef cppclass CFileFragment "arrow::dataset::FileFragment"(CFragment):
        const CFileSource& source() const
        const shared_ptr[CFileFormat]& format() const

    # Plain option struct handed to CFileSystemDataset.Write.
    cdef cppclass CFileSystemDatasetWriteOptions \
            "arrow::dataset::FileSystemDatasetWriteOptions":
        shared_ptr[CFileWriteOptions] file_write_options
        shared_ptr[CFileSystem] filesystem
        c_string base_dir
        shared_ptr[CPartitioning] partitioning
        int max_partitions
        c_string basename_template
        # Both hooks wrap the cb_writer_finish_internal signature.
        function[cb_writer_finish_internal] writer_pre_finish
        function[cb_writer_finish_internal] writer_post_finish
        ExistingDataBehavior existing_data_behavior
        c_bool create_dir
        uint32_t max_open_files
        uint64_t max_rows_per_file
        uint64_t min_rows_per_group
        uint64_t max_rows_per_group

    cdef cppclass CFileSystemDataset \
            "arrow::dataset::FileSystemDataset"(CDataset):
        @staticmethod
        CResult[shared_ptr[CDataset]] Make(
            shared_ptr[CSchema] schema,
            CExpression source_partition,
            shared_ptr[CFileFormat] format,
            shared_ptr[CFileSystem] filesystem,
            vector[shared_ptr[CFileFragment]] fragments)

        @staticmethod
        CStatus Write(
            const CFileSystemDatasetWriteOptions& write_options,
            shared_ptr[CScanner] scanner)

        # NOTE(review): every other class in this file exposes type_name();
        # confirm that a type() member really exists on the C++ side.
        c_string type()
        vector[c_string] files()
        const shared_ptr[CFileFormat]& format() const
        const shared_ptr[CFileSystem]& filesystem() const
        const shared_ptr[CPartitioning]& partitioning() const

    # -- Concrete formats (IPC, ORC, CSV) ------------------------------------
    cdef cppclass CIpcFileWriteOptions \
            "arrow::dataset::IpcFileWriteOptions"(CFileWriteOptions):
        pass

    cdef cppclass CIpcFileFormat "arrow::dataset::IpcFileFormat"(CFileFormat):
        pass

    cdef cppclass COrcFileFormat "arrow::dataset::OrcFileFormat"(CFileFormat):
        pass

    cdef cppclass CCsvFileWriteOptions \
            "arrow::dataset::CsvFileWriteOptions"(CFileWriteOptions):
        shared_ptr[CCSVWriteOptions] write_options
        CMemoryPool* pool

    cdef cppclass CCsvFileFormat "arrow::dataset::CsvFileFormat"(CFileFormat):
        CCSVParseOptions parse_options

    cdef cppclass CCsvFragmentScanOptions \
            "arrow::dataset::CsvFragmentScanOptions"(CFragmentScanOptions):
        CCSVConvertOptions convert_options
        CCSVReadOptions read_options

    # -- Partitioning --------------------------------------------------------
    cdef cppclass CPartitioning "arrow::dataset::Partitioning":
        c_string type_name() const
        CResult[CExpression] Parse(const c_string & path) const
        const shared_ptr[CSchema] & schema()

    # SegmentEncoding is declared as an opaque class, with its two values
    # (None, Uri) bound as separate names of that type.
    cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding":
        pass

    CSegmentEncoding CSegmentEncodingNone\
        " arrow::dataset::SegmentEncoding::None"
    CSegmentEncoding CSegmentEncodingUri\
        " arrow::dataset::SegmentEncoding::Uri"

    cdef cppclass CKeyValuePartitioningOptions \
            "arrow::dataset::KeyValuePartitioningOptions":
        CSegmentEncoding segment_encoding

    cdef cppclass CHivePartitioningOptions \
            "arrow::dataset::HivePartitioningOptions":
        CSegmentEncoding segment_encoding
        c_string null_fallback

    cdef cppclass CPartitioningFactoryOptions \
            "arrow::dataset::PartitioningFactoryOptions":
        c_bool infer_dictionary
        shared_ptr[CSchema] schema
        CSegmentEncoding segment_encoding

    cdef cppclass CHivePartitioningFactoryOptions \
            "arrow::dataset::HivePartitioningFactoryOptions":
        c_bool infer_dictionary
        c_string null_fallback
        shared_ptr[CSchema] schema
        CSegmentEncoding segment_encoding

    cdef cppclass CPartitioningFactory "arrow::dataset::PartitioningFactory":
        c_string type_name() const

    cdef cppclass CKeyValuePartitioning \
            "arrow::dataset::KeyValuePartitioning"(CPartitioning):
        CKeyValuePartitioning(
            shared_ptr[CSchema] schema,
            vector[shared_ptr[CArray]] dictionaries,
            CKeyValuePartitioningOptions options)

        vector[shared_ptr[CArray]] dictionaries() const

    cdef cppclass CDirectoryPartitioning \
            "arrow::dataset::DirectoryPartitioning"(CPartitioning):
        CDirectoryPartitioning(
            shared_ptr[CSchema] schema,
            vector[shared_ptr[CArray]] dictionaries)

        @staticmethod
        shared_ptr[CPartitioningFactory] MakeFactory(
            vector[c_string] field_names,
            CPartitioningFactoryOptions)

        vector[shared_ptr[CArray]] dictionaries() const

    cdef cppclass CHivePartitioning \
            "arrow::dataset::HivePartitioning"(CPartitioning):
        CHivePartitioning(
            shared_ptr[CSchema] schema,
            vector[shared_ptr[CArray]] dictionaries,
            CHivePartitioningOptions options)

        @staticmethod
        shared_ptr[CPartitioningFactory] MakeFactory(
            CHivePartitioningFactoryOptions)

        vector[shared_ptr[CArray]] dictionaries() const

    cdef cppclass CFilenamePartitioning \
            "arrow::dataset::FilenamePartitioning"(CPartitioning):
        CFilenamePartitioning(
            shared_ptr[CSchema] schema,
            vector[shared_ptr[CArray]] dictionaries)

        @staticmethod
        shared_ptr[CPartitioningFactory] MakeFactory(
            vector[c_string] field_names,
            CPartitioningFactoryOptions)

        vector[shared_ptr[CArray]] dictionaries() const

    # Constructible and assignable from either a partitioning or a
    # partitioning factory; exposes an accessor for each.
    cdef cppclass CPartitioningOrFactory \
            "arrow::dataset::PartitioningOrFactory":
        CPartitioningOrFactory(shared_ptr[CPartitioning])
        CPartitioningOrFactory(shared_ptr[CPartitioningFactory])
        CPartitioningOrFactory & operator = (shared_ptr[CPartitioning])
        CPartitioningOrFactory & operator = (
            shared_ptr[CPartitioningFactory])
        shared_ptr[CPartitioning] partitioning() const
        shared_ptr[CPartitioningFactory] factory() const

    # -- File-system dataset factory -----------------------------------------
    cdef cppclass CFileSystemFactoryOptions \
            "arrow::dataset::FileSystemFactoryOptions":
        CPartitioningOrFactory partitioning
        c_string partition_base_dir
        c_bool exclude_invalid_files
        vector[c_string] selector_ignore_prefixes

    cdef cppclass CFileSystemDatasetFactory \
            "arrow::dataset::FileSystemDatasetFactory"(CDatasetFactory):
        # MakeFromPaths and MakeFromSelector both bind the C++ name "Make";
        # they differ in the second argument (explicit paths vs. a selector).
        @staticmethod
        CResult[shared_ptr[CDatasetFactory]] MakeFromPaths "Make"(
            shared_ptr[CFileSystem] filesystem,
            vector[c_string] paths,
            shared_ptr[CFileFormat] format,
            CFileSystemFactoryOptions options)

        @staticmethod
        CResult[shared_ptr[CDatasetFactory]] MakeFromSelector "Make"(
            shared_ptr[CFileSystem] filesystem,
            CFileSelector,
            shared_ptr[CFileFormat] format,
            CFileSystemFactoryOptions options)