// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Implement Arrow streaming binary format

#pragma once

#include <cstdint>
#include <memory>
#include <vector>

#include "arrow/ipc/dictionary.h"  // IWYU pragma: export
#include "arrow/ipc/message.h"
#include "arrow/ipc/options.h"
#include "arrow/result.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

class Array;
class Buffer;
class MemoryManager;
class MemoryPool;
class RecordBatch;
class Schema;
class Status;
class Table;
class Tensor;
class SparseTensor;

namespace io {

class OutputStream;

}  // namespace io

namespace ipc {

/// \brief Intermediate data structure with metadata header, and zero
/// or more buffers for the message body.
struct IpcPayload {
  MessageType type = MessageType::NONE;
  std::shared_ptr<Buffer> metadata;
  std::vector<std::shared_ptr<Buffer>> body_buffers;
  int64_t body_length = 0;      // serialized body length (padded, maybe compressed)
  int64_t raw_body_length = 0;  // initial uncompressed body length
};

struct WriteStats {
  /// Number of IPC messages written.
  int64_t num_messages = 0;
  /// Number of record batches written.
  int64_t num_record_batches = 0;
  /// Number of dictionary batches written.
  ///
  /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries
  int64_t num_dictionary_batches = 0;

  /// Number of dictionary deltas written.
  int64_t num_dictionary_deltas = 0;
  /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
  /// an existing dictionary with an unrelated new dictionary).
  int64_t num_replaced_dictionaries = 0;

  /// Total size in bytes of record batches emitted.
  /// The "raw" size counts the original buffer sizes, while the "serialized" size
  /// includes padding and (optionally) compression.
  int64_t total_raw_body_size = 0;
  int64_t total_serialized_body_size = 0;
};
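// A minimal usage sketch (illustrative only): WriteStats is obtained from
// RecordBatchWriter::stats() (declared below), typically after Close(), to inspect
// how many messages and batches were emitted. The `writer` variable below is a
// hypothetical, already-constructed RecordBatchWriter, not part of this header.
//
//   ARROW_RETURN_NOT_OK(writer->Close());
//   const arrow::ipc::WriteStats stats = writer->stats();
//   // num_messages counts every IPC message, including the schema message,
//   // so it is at least num_record_batches + num_dictionary_batches.
//   // total_serialized_body_size reflects padding and optional compression,
//   // while total_raw_body_size counts the original buffer sizes.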
/// \class RecordBatchWriter
/// \brief Abstract interface for writing a stream of record batches
class ARROW_EXPORT RecordBatchWriter {
 public:
  virtual ~RecordBatchWriter();

  /// \brief Write a record batch to the stream
  ///
  /// \param[in] batch the record batch to write to the stream
  /// \return Status
  virtual Status WriteRecordBatch(const RecordBatch& batch) = 0;

  /// \brief Write a record batch with custom metadata to the stream
  ///
  /// \param[in] batch the record batch to write to the stream
  /// \param[in] custom_metadata the record batch's custom metadata to write to the stream
  /// \return Status
  virtual Status WriteRecordBatch(
      const RecordBatch& batch,
      const std::shared_ptr<const KeyValueMetadata>& custom_metadata) {
    return Status::NotImplemented(
        "Write record batch with custom metadata not implemented");
  }

  /// \brief Write possibly-chunked table by creating sequence of record batches
  /// \param[in] table table to write
  /// \return Status
  Status WriteTable(const Table& table);

  /// \brief Write Table with a particular chunksize
  /// \param[in] table table to write
  /// \param[in] max_chunksize maximum length of table chunks. To indicate
  /// that no maximum should be enforced, pass -1.
  /// \return Status
  virtual Status WriteTable(const Table& table, int64_t max_chunksize);

  /// \brief Perform any logic necessary to finish the stream
  ///
  /// \return Status
  virtual Status Close() = 0;

  /// \brief Return current write statistics
  virtual WriteStats stats() const = 0;
};

/// \defgroup record-batch-writer-factories Functions for creating RecordBatchWriter
/// instances
///
/// @{

/// Create a new IPC stream writer from stream sink and schema. User is
/// responsible for closing the actual OutputStream.
///
/// \param[in] sink output stream to write to
/// \param[in] schema the schema of the record batches to be written
/// \param[in] options options for serialization
/// \return Result<std::shared_ptr<RecordBatchWriter>>
ARROW_EXPORT
Result<std::shared_ptr<RecordBatchWriter>> MakeStreamWriter(
    io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
    const IpcWriteOptions& options = IpcWriteOptions::Defaults());
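// A minimal usage sketch for the stream format, assuming an in-memory sink.
// BufferOutputStream comes from "arrow/io/memory.h"; `WriteBatchesToBuffer` and its
// arguments are hypothetical names, not Arrow APIs.
//
//   #include "arrow/io/memory.h"
//
//   arrow::Status WriteBatchesToBuffer(
//       const std::shared_ptr<arrow::Schema>& schema,
//       const std::vector<std::shared_ptr<arrow::RecordBatch>>& batches) {
//     ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::BufferOutputStream::Create());
//     // The shared_ptr overload keeps the sink alive for the writer's lifetime.
//     ARROW_ASSIGN_OR_RAISE(auto writer, arrow::ipc::MakeStreamWriter(sink, schema));
//     for (const auto& batch : batches) {
//       ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
//     }
//     ARROW_RETURN_NOT_OK(writer->Close());
//     ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish());  // encapsulated stream bytes
//     return arrow::Status::OK();
//   }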
/// Create a new IPC stream writer from stream sink and schema. User is
/// responsible for closing the actual OutputStream.
///
/// \param[in] sink output stream to write to
/// \param[in] schema the schema of the record batches to be written
/// \param[in] options options for serialization
/// \return Result<std::shared_ptr<RecordBatchWriter>>
ARROW_EXPORT
Result<std::shared_ptr<RecordBatchWriter>> MakeStreamWriter(
    std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
    const IpcWriteOptions& options = IpcWriteOptions::Defaults());

/// Create a new IPC file writer from stream sink and schema
///
/// \param[in] sink output stream to write to
/// \param[in] schema the schema of the record batches to be written
/// \param[in] options options for serialization, optional
/// \param[in] metadata custom metadata for File Footer, optional
/// \return Result<std::shared_ptr<RecordBatchWriter>>
ARROW_EXPORT
Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
    io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
    const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
    const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);

/// Create a new IPC file writer from stream sink and schema
///
/// \param[in] sink output stream to write to
/// \param[in] schema the schema of the record batches to be written
/// \param[in] options options for serialization, optional
/// \param[in] metadata custom metadata for File Footer, optional
/// \return Result<std::shared_ptr<RecordBatchWriter>>
ARROW_EXPORT
Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
    std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
    const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
    const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);

/// @}

ARROW_DEPRECATED("Deprecated in 3.0.0. Use MakeStreamWriter")
ARROW_EXPORT
Result<std::shared_ptr<RecordBatchWriter>> NewStreamWriter(
    io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
    const IpcWriteOptions& options = IpcWriteOptions::Defaults());

ARROW_DEPRECATED("Deprecated in 2.0.0. Use MakeFileWriter")
ARROW_EXPORT
Result<std::shared_ptr<RecordBatchWriter>> NewFileWriter(
    io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
    const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
    const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
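// A minimal usage sketch for the IPC file format, assuming a local file sink.
// FileOutputStream comes from "arrow/io/file.h"; `WriteTableToFile`, `path` and
// `table` are hypothetical names, not Arrow APIs.
//
//   #include "arrow/io/file.h"
//
//   arrow::Status WriteTableToFile(const std::string& path, const arrow::Table& table) {
//     ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::FileOutputStream::Open(path));
//     ARROW_ASSIGN_OR_RAISE(auto writer,
//                           arrow::ipc::MakeFileWriter(sink, table.schema()));
//     ARROW_RETURN_NOT_OK(writer->WriteTable(table));
//     return writer->Close();  // finalizes the file footer
//   }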
/// \brief Low-level API for writing a record batch (without schema)
/// to an OutputStream as encapsulated IPC message. See Arrow format
/// documentation for more detail.
///
/// \param[in] batch the record batch to write
/// \param[in] buffer_start_offset the start offset to use in the buffer metadata,
/// generally should be 0
/// \param[in] dst an OutputStream
/// \param[out] metadata_length the size of the length-prefixed flatbuffer
/// including padding to a 64-byte boundary
/// \param[out] body_length the size of the contiguous buffer block plus padding
/// \param[in] options options for serialization
/// \return Status
ARROW_EXPORT
Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset,
                        io::OutputStream* dst, int32_t* metadata_length,
                        int64_t* body_length, const IpcWriteOptions& options);

/// \brief Serialize record batch as encapsulated IPC message in a new buffer
///
/// \param[in] batch the record batch
/// \param[in] options the IpcWriteOptions to use for serialization
/// \return the serialized message
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> SerializeRecordBatch(const RecordBatch& batch,
                                                     const IpcWriteOptions& options);

/// \brief Serialize record batch as encapsulated IPC message in a new buffer
///
/// \param[in] batch the record batch
/// \param[in] mm a MemoryManager to allocate memory from
/// \return the serialized message
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> SerializeRecordBatch(const RecordBatch& batch,
                                                     std::shared_ptr<MemoryManager> mm);

/// \brief Write record batch to OutputStream
///
/// \param[in] batch the record batch to write
/// \param[in] options the IpcWriteOptions to use for serialization
/// \param[in] out the OutputStream to write the output to
/// \return Status
///
/// If writing to pre-allocated memory, you can use
/// arrow::ipc::GetRecordBatchSize to compute how much space is required
ARROW_EXPORT
Status SerializeRecordBatch(const RecordBatch& batch, const IpcWriteOptions& options,
                            io::OutputStream* out);

/// \brief Serialize schema as encapsulated IPC message
///
/// \param[in] schema the schema to write
/// \param[in] pool a MemoryPool to allocate memory from
/// \return the serialized schema
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> SerializeSchema(
    const Schema& schema, MemoryPool* pool = default_memory_pool());
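// A minimal usage sketch: serializing one record batch into a standalone buffer
// (an encapsulated IPC message) with default write options. `EncodeBatch` is a
// hypothetical name, not an Arrow API.
//
//   arrow::Result<std::shared_ptr<arrow::Buffer>> EncodeBatch(
//       const arrow::RecordBatch& batch) {
//     auto options = arrow::ipc::IpcWriteOptions::Defaults();
//     return arrow::ipc::SerializeRecordBatch(batch, options);
//   }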
/// \brief Write multiple record batches to OutputStream, including schema
/// \param[in] batches a vector of batches. Must all have same schema
/// \param[in] options options for serialization
/// \param[out] dst an OutputStream
/// \return Status
ARROW_EXPORT
Status WriteRecordBatchStream(const std::vector<std::shared_ptr<RecordBatch>>& batches,
                              const IpcWriteOptions& options, io::OutputStream* dst);

/// \brief Compute the number of bytes needed to write an IPC payload
/// including metadata
///
/// \param[in] payload the IPC payload to write
/// \param[in] options write options
/// \return the size of the complete encapsulated message
ARROW_EXPORT
int64_t GetPayloadSize(const IpcPayload& payload,
                       const IpcWriteOptions& options = IpcWriteOptions::Defaults());

/// \brief Compute the number of bytes needed to write a record batch including metadata
///
/// \param[in] batch the record batch to write
/// \param[out] size the size of the complete encapsulated message
/// \return Status
ARROW_EXPORT
Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size);

/// \brief Compute the number of bytes needed to write a record batch including metadata
///
/// \param[in] batch the record batch to write
/// \param[in] options options for serialization
/// \param[out] size the size of the complete encapsulated message
/// \return Status
ARROW_EXPORT
Status GetRecordBatchSize(const RecordBatch& batch, const IpcWriteOptions& options,
                          int64_t* size);
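// A minimal usage sketch for writing into pre-allocated memory, as suggested by the
// SerializeRecordBatch documentation above: size the batch with GetRecordBatchSize,
// allocate, then serialize into a FixedSizeBufferWriter ("arrow/io/memory.h").
// `EncodeIntoPreallocated` is a hypothetical name; the size assumes default options.
//
//   #include "arrow/buffer.h"
//   #include "arrow/io/memory.h"
//
//   arrow::Result<std::shared_ptr<arrow::Buffer>> EncodeIntoPreallocated(
//       const arrow::RecordBatch& batch) {
//     int64_t size = 0;
//     ARROW_RETURN_NOT_OK(arrow::ipc::GetRecordBatchSize(batch, &size));
//     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> buffer,
//                           arrow::AllocateBuffer(size));
//     arrow::io::FixedSizeBufferWriter stream(buffer);
//     ARROW_RETURN_NOT_OK(arrow::ipc::SerializeRecordBatch(
//         batch, arrow::ipc::IpcWriteOptions::Defaults(), &stream));
//     return buffer;
//   }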
/// \brief Compute the number of bytes needed to write a tensor including metadata
///
/// \param[in] tensor the tensor to write
/// \param[out] size the size of the complete encapsulated message
/// \return Status
ARROW_EXPORT
Status GetTensorSize(const Tensor& tensor, int64_t* size);

/// \brief EXPERIMENTAL: Convert arrow::Tensor to a Message with minimal memory
/// allocation
///
/// \param[in] tensor the Tensor to write
/// \param[in] pool MemoryPool to allocate space for metadata
/// \return the resulting Message
ARROW_EXPORT
Result<std::unique_ptr<Message>> GetTensorMessage(const Tensor& tensor, MemoryPool* pool);

/// \brief Write arrow::Tensor as a contiguous message.
///
/// The metadata and body are written assuming 64-byte alignment. It is the
/// user's responsibility to ensure that the OutputStream has been aligned
/// to a 64-byte multiple before writing the message.
///
/// The message is written out as follows:
/// \code
/// <metadata size> <metadata> <tensor data>
/// \endcode
///
/// \param[in] tensor the Tensor to write
/// \param[in] dst the OutputStream to write to
/// \param[out] metadata_length the actual metadata length, including padding
/// \param[out] body_length the actual message body length
/// \return Status
ARROW_EXPORT
Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length,
                   int64_t* body_length);

/// \brief EXPERIMENTAL: Convert arrow::SparseTensor to a Message with minimal memory
/// allocation
///
/// The message is written out as follows:
/// \code
/// <metadata size> <metadata> <sparse index> <sparse tensor body>
/// \endcode
///
/// \param[in] sparse_tensor the SparseTensor to write
/// \param[in] pool MemoryPool to allocate space for metadata
/// \return the resulting Message
ARROW_EXPORT
Result<std::unique_ptr<Message>> GetSparseTensorMessage(const SparseTensor& sparse_tensor,
                                                        MemoryPool* pool);

/// \brief EXPERIMENTAL: Write arrow::SparseTensor as a contiguous message. The metadata,
/// sparse index, and body are written assuming 64-byte alignment. It is the
/// user's responsibility to ensure that the OutputStream has been aligned
/// to a 64-byte multiple before writing the message.
///
/// \param[in] sparse_tensor the SparseTensor to write
/// \param[in] dst the OutputStream to write to
/// \param[out] metadata_length the actual metadata length, including padding
/// \param[out] body_length the actual message body length
/// \return Status
ARROW_EXPORT
Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst,
                         int32_t* metadata_length, int64_t* body_length);

/// \brief Compute IpcPayload for the given schema
/// \param[in] schema the Schema that is being serialized
/// \param[in] options options for serialization
/// \param[in] mapper object mapping dictionary fields to dictionary ids
/// \param[out] out the returned IpcPayload
/// \return Status
ARROW_EXPORT
Status GetSchemaPayload(const Schema& schema, const IpcWriteOptions& options,
                        const DictionaryFieldMapper& mapper, IpcPayload* out);

/// \brief Compute IpcPayload for a dictionary
/// \param[in] id the dictionary id
/// \param[in] dictionary the dictionary values
/// \param[in] options options for serialization
/// \param[out] payload the output IpcPayload
/// \return Status
ARROW_EXPORT
Status GetDictionaryPayload(int64_t id, const std::shared_ptr<Array>& dictionary,
                            const IpcWriteOptions& options, IpcPayload* payload);

/// \brief Compute IpcPayload for a dictionary
/// \param[in] id the dictionary id
/// \param[in] is_delta whether the dictionary is a delta dictionary
/// \param[in] dictionary the dictionary values
/// \param[in] options options for serialization
/// \param[out] payload the output IpcPayload
/// \return Status
ARROW_EXPORT
Status GetDictionaryPayload(int64_t id, bool is_delta,
                            const std::shared_ptr<Array>& dictionary,
                            const IpcWriteOptions& options, IpcPayload* payload);

/// \brief Compute IpcPayload for the given record batch
/// \param[in] batch the RecordBatch that is being serialized
/// \param[in] options options for serialization
/// \param[out] out the returned IpcPayload
/// \return Status
ARROW_EXPORT
Status GetRecordBatchPayload(const RecordBatch& batch, const IpcWriteOptions& options,
                             IpcPayload* out);

/// \brief Compute IpcPayload for the given record batch and custom metadata
/// \param[in] batch the RecordBatch that is being serialized
/// \param[in] custom_metadata the custom metadata to be serialized with the record batch
/// \param[in] options options for serialization
/// \param[out] out the returned IpcPayload
/// \return Status
ARROW_EXPORT
Status GetRecordBatchPayload(
    const RecordBatch& batch,
    const std::shared_ptr<const KeyValueMetadata>& custom_metadata,
    const IpcWriteOptions& options, IpcPayload* out);

/// \brief Write an IPC payload to the given stream.
/// \param[in] payload the payload to write
/// \param[in] options options for serialization
/// \param[in] dst The stream to write the payload to.
/// \param[out] metadata_length the length of the serialized metadata
/// \return Status
ARROW_EXPORT
Status WriteIpcPayload(const IpcPayload& payload, const IpcWriteOptions& options,
                       io::OutputStream* dst, int32_t* metadata_length);
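// A minimal usage sketch of the payload-level API: emit a schema message followed by
// one record batch message on an arbitrary OutputStream. Dictionary batches are not
// handled here; `WritePayloads` and its arguments are hypothetical names, not Arrow
// APIs, and the DictionaryFieldMapper(schema) constructor is assumed from
// "arrow/ipc/dictionary.h".
//
//   arrow::Status WritePayloads(const arrow::Schema& schema,
//                               const arrow::RecordBatch& batch,
//                               arrow::io::OutputStream* dst) {
//     auto options = arrow::ipc::IpcWriteOptions::Defaults();
//     arrow::ipc::DictionaryFieldMapper mapper(schema);
//
//     arrow::ipc::IpcPayload schema_payload;
//     ARROW_RETURN_NOT_OK(
//         arrow::ipc::GetSchemaPayload(schema, options, mapper, &schema_payload));
//     int32_t metadata_length = 0;
//     ARROW_RETURN_NOT_OK(
//         arrow::ipc::WriteIpcPayload(schema_payload, options, dst, &metadata_length));
//
//     arrow::ipc::IpcPayload batch_payload;
//     ARROW_RETURN_NOT_OK(
//         arrow::ipc::GetRecordBatchPayload(batch, options, &batch_payload));
//     return arrow::ipc::WriteIpcPayload(batch_payload, options, dst, &metadata_length);
//   }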
/// \brief Compute IpcPayload for the given sparse tensor
/// \param[in] sparse_tensor the SparseTensor that is being serialized
/// \param[in,out] pool for any required temporary memory allocations
/// \param[out] out the returned IpcPayload
/// \return Status
ARROW_EXPORT
Status GetSparseTensorPayload(const SparseTensor& sparse_tensor, MemoryPool* pool,
                              IpcPayload* out);

namespace internal {

// These internal APIs may change without warning or deprecation

class ARROW_EXPORT IpcPayloadWriter {
 public:
  virtual ~IpcPayloadWriter();

  // Default implementation is a no-op
  virtual Status Start();

  virtual Status WritePayload(const IpcPayload& payload) = 0;

  virtual Status Close() = 0;
};

/// Create a new IPC payload stream writer from stream sink. User is
/// responsible for closing the actual OutputStream.
///
/// \param[in] sink output stream to write to
/// \param[in] options options for serialization
/// \return Result<std::unique_ptr<IpcPayloadWriter>>
ARROW_EXPORT
Result<std::unique_ptr<IpcPayloadWriter>> MakePayloadStreamWriter(
    io::OutputStream* sink,
    const IpcWriteOptions& options = IpcWriteOptions::Defaults());

/// Create a new IPC payload file writer from stream sink.
///
/// \param[in] sink output stream to write to
/// \param[in] schema the schema of the record batches to be written
/// \param[in] options options for serialization, optional
/// \param[in] metadata custom metadata for File Footer, optional
/// \return Result<std::unique_ptr<IpcPayloadWriter>>
ARROW_EXPORT
Result<std::unique_ptr<IpcPayloadWriter>> MakePayloadFileWriter(
    io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
    const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
    const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);

/// Create a new RecordBatchWriter from IpcPayloadWriter and schema.
///
/// The format is implicitly the IPC stream format (allowing dictionary
/// replacement and deltas).
///
/// \param[in] sink the IpcPayloadWriter to write to
/// \param[in] schema the schema of the record batches to be written
/// \param[in] options options for serialization
/// \return Result<std::unique_ptr<RecordBatchWriter>>
ARROW_EXPORT
Result<std::unique_ptr<RecordBatchWriter>> OpenRecordBatchWriter(
    std::unique_ptr<IpcPayloadWriter> sink, const std::shared_ptr<Schema>& schema,
    const IpcWriteOptions& options = IpcWriteOptions::Defaults());

}  // namespace internal

}  // namespace ipc
}  // namespace arrow