// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

// NOTE(review): the standard-library include names were lost during text
// extraction; the list below is reconstructed from the identifiers used in
// this header — verify against the upstream file.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <sstream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "arrow/array.h"
#include "arrow/compute/exec/expression.h"
#include "arrow/dataset/dataset_internal.h"
#include "arrow/dataset/discovery.h"
#include "arrow/dataset/file_base.h"
#include "arrow/filesystem/localfs.h"
#include "arrow/filesystem/mockfs.h"
#include "arrow/filesystem/path_util.h"
#include "arrow/filesystem/test_util.h"
#include "arrow/record_batch.h"
#include "arrow/table.h"
#include "arrow/testing/future_util.h"
#include "arrow/testing/generator.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/matchers.h"
#include "arrow/testing/random.h"
#include "arrow/util/async_generator.h"
#include "arrow/util/io_util.h"
#include "arrow/util/iterator.h"
#include "arrow/util/logging.h"
#include "arrow/util/make_unique.h"
#include "arrow/util/thread_pool.h"

namespace arrow {

using internal::checked_cast;
using internal::checked_pointer_cast;
using internal::TemporaryDir;

namespace dataset {

using compute::call;
using compute::field_ref;
using compute::literal;

using compute::and_;
using compute::equal;
using compute::greater;
using
compute::greater_equal; using compute::is_null; using compute::is_valid; using compute::less; using compute::less_equal; using compute::not_; using compute::not_equal; using compute::or_; using compute::project; using fs::internal::GetAbstractPathExtension; class FileSourceFixtureMixin : public ::testing::Test { public: std::unique_ptr GetSource(std::shared_ptr buffer) { return ::arrow::internal::make_unique(std::move(buffer)); } }; template class GeneratedRecordBatch : public RecordBatchReader { public: GeneratedRecordBatch(std::shared_ptr schema, Gen gen) : schema_(std::move(schema)), gen_(gen) {} std::shared_ptr schema() const override { return schema_; } Status ReadNext(std::shared_ptr* batch) override { return gen_(batch); } private: std::shared_ptr schema_; Gen gen_; }; template std::unique_ptr> MakeGeneratedRecordBatch( std::shared_ptr schema, Gen&& gen) { return ::arrow::internal::make_unique>( schema, std::forward(gen)); } std::unique_ptr MakeGeneratedRecordBatch( std::shared_ptr schema, int64_t batch_size, int64_t batch_repetitions) { auto batch = random::GenerateBatch(schema->fields(), batch_size, /*seed=*/0); int64_t i = 0; return MakeGeneratedRecordBatch( schema, [batch, i, batch_repetitions](std::shared_ptr* out) mutable { *out = i++ < batch_repetitions ? batch : nullptr; return Status::OK(); }); } void EnsureRecordBatchReaderDrained(RecordBatchReader* reader) { ASSERT_OK_AND_ASSIGN(auto batch, reader->Next()); EXPECT_EQ(batch, nullptr); } class DatasetFixtureMixin : public ::testing::Test { public: /// \brief Ensure that record batches found in reader are equals to the /// record batches yielded by the data fragment. 
void AssertScanTaskEquals(RecordBatchReader* expected, RecordBatchGenerator batch_gen, bool ensure_drained = true) { ASSERT_FINISHES_OK(VisitAsyncGenerator( batch_gen, [expected](std::shared_ptr rhs) -> Status { std::shared_ptr lhs; RETURN_NOT_OK(expected->ReadNext(&lhs)); EXPECT_NE(lhs, nullptr); AssertBatchesEqual(*lhs, *rhs); return Status::OK(); })); if (ensure_drained) { EnsureRecordBatchReaderDrained(expected); } } /// \brief Assert the value of the next batch yielded by the reader void AssertBatchEquals(RecordBatchReader* expected, const RecordBatch& batch) { std::shared_ptr lhs; ASSERT_OK(expected->ReadNext(&lhs)); EXPECT_NE(lhs, nullptr); AssertBatchesEqual(*lhs, batch); } /// \brief Ensure that record batches found in reader are equals to the /// record batches yielded by the data fragment. void AssertFragmentEquals(RecordBatchReader* expected, Fragment* fragment, bool ensure_drained = true) { ASSERT_OK_AND_ASSIGN(auto batch_gen, fragment->ScanBatchesAsync(options_)); AssertScanTaskEquals(expected, batch_gen); if (ensure_drained) { EnsureRecordBatchReaderDrained(expected); } } /// \brief Ensure that record batches found in reader are equals to the /// record batches yielded by the data fragments of a dataset. void AssertDatasetFragmentsEqual(RecordBatchReader* expected, Dataset* dataset, bool ensure_drained = true) { ASSERT_OK_AND_ASSIGN(auto predicate, options_->filter.Bind(*dataset->schema())); ASSERT_OK_AND_ASSIGN(auto it, dataset->GetFragments(predicate)); ARROW_EXPECT_OK(it.Visit([&](std::shared_ptr fragment) -> Status { AssertFragmentEquals(expected, fragment.get(), false); return Status::OK(); })); if (ensure_drained) { EnsureRecordBatchReaderDrained(expected); } } /// \brief Ensure that record batches found in reader are equals to the /// record batches yielded by a scanner. 
void AssertScannerEquals(RecordBatchReader* expected, Scanner* scanner, bool ensure_drained = true) { ASSERT_OK_AND_ASSIGN(auto it, scanner->ScanBatches()); ARROW_EXPECT_OK(it.Visit([&](TaggedRecordBatch batch) -> Status { std::shared_ptr lhs; RETURN_NOT_OK(expected->ReadNext(&lhs)); EXPECT_NE(lhs, nullptr); AssertBatchesEqual(*lhs, *batch.record_batch); return Status::OK(); })); if (ensure_drained) { EnsureRecordBatchReaderDrained(expected); } } /// \brief Ensure that record batches found in reader are equals to the /// record batches yielded by a scanner. void AssertScanBatchesEquals(RecordBatchReader* expected, Scanner* scanner, bool ensure_drained = true) { ASSERT_OK_AND_ASSIGN(auto it, scanner->ScanBatches()); ARROW_EXPECT_OK(it.Visit([&](TaggedRecordBatch batch) -> Status { AssertBatchEquals(expected, *batch.record_batch); return Status::OK(); })); if (ensure_drained) { EnsureRecordBatchReaderDrained(expected); } } /// \brief Ensure that record batches found in reader are equals to the /// record batches yielded by a scanner. 
void AssertScanBatchesUnorderedEquals(RecordBatchReader* expected, Scanner* scanner, int expected_batches_per_fragment, bool ensure_drained = true) { ASSERT_OK_AND_ASSIGN(auto it, scanner->ScanBatchesUnordered()); // ToVector does not work since EnumeratedRecordBatch is not comparable std::vector batches; for (;;) { ASSERT_OK_AND_ASSIGN(auto batch, it.Next()); if (IsIterationEnd(batch)) break; batches.push_back(std::move(batch)); } std::sort(batches.begin(), batches.end(), [](const EnumeratedRecordBatch& left, const EnumeratedRecordBatch& right) -> bool { if (left.fragment.index < right.fragment.index) { return true; } if (left.fragment.index > right.fragment.index) { return false; } return left.record_batch.index < right.record_batch.index; }); int fragment_counter = 0; bool saw_last_fragment = false; int batch_counter = 0; for (const auto& batch : batches) { if (batch_counter == 0) { EXPECT_FALSE(saw_last_fragment); } EXPECT_EQ(batch_counter++, batch.record_batch.index); auto last_batch = batch_counter == expected_batches_per_fragment; EXPECT_EQ(last_batch, batch.record_batch.last); EXPECT_EQ(fragment_counter, batch.fragment.index); if (last_batch) { fragment_counter++; batch_counter = 0; } saw_last_fragment = batch.fragment.last; AssertBatchEquals(expected, *batch.record_batch.value); } if (ensure_drained) { EnsureRecordBatchReaderDrained(expected); } } /// \brief Ensure that record batches found in reader are equals to the /// record batches yielded by a dataset. 
void AssertDatasetEquals(RecordBatchReader* expected, Dataset* dataset, bool ensure_drained = true) { ASSERT_OK_AND_ASSIGN(auto builder, dataset->NewScan()); ASSERT_OK_AND_ASSIGN(auto scanner, builder->Finish()); AssertScannerEquals(expected, scanner.get()); if (ensure_drained) { EnsureRecordBatchReaderDrained(expected); } } protected: void SetSchema(std::vector> fields) { schema_ = schema(std::move(fields)); options_ = std::make_shared(); options_->dataset_schema = schema_; ASSERT_OK_AND_ASSIGN(auto projection, ProjectionDescr::FromNames(schema_->field_names(), *schema_)); SetProjection(options_.get(), std::move(projection)); SetFilter(literal(true)); } void SetFilter(compute::Expression filter) { ASSERT_OK_AND_ASSIGN(options_->filter, filter.Bind(*schema_)); } void SetProjectedColumns(std::vector column_names) { ASSERT_OK_AND_ASSIGN( auto projection, ProjectionDescr::FromNames(std::move(column_names), *options_->dataset_schema)); SetProjection(options_.get(), std::move(projection)); } std::shared_ptr schema_; std::shared_ptr options_; }; template class DatasetFixtureMixinWithParam : public DatasetFixtureMixin, public ::testing::WithParamInterface

{}; struct TestFormatParams { bool use_threads; int num_batches; int items_per_batch; int64_t expected_rows() const { return num_batches * items_per_batch; } std::string ToString() const { // GTest requires this to be alphanumeric std::stringstream ss; ss << (use_threads ? "Threaded" : "Serial") << num_batches << "b" << items_per_batch << "r"; return ss.str(); } static std::string ToTestNameString( const ::testing::TestParamInfo& info) { return std::to_string(info.index) + info.param.ToString(); } static std::vector Values() { std::vector values; for (const bool use_threads : std::vector{true, false}) { values.push_back(TestFormatParams{use_threads, 16, 1024}); } return values; } }; std::ostream& operator<<(std::ostream& out, const TestFormatParams& params) { out << params.ToString(); return out; } class FileFormatWriterMixin { virtual std::shared_ptr Write(RecordBatchReader* reader) = 0; virtual std::shared_ptr Write(const Table& table) = 0; }; /// FormatHelper should be a class with these static methods: /// std::shared_ptr Write(RecordBatchReader* reader); /// std::shared_ptr MakeFormat(); template class FileFormatFixtureMixin : public ::testing::Test { public: constexpr static int64_t kBatchSize = 1UL << 12; constexpr static int64_t kBatchRepetitions = 1 << 5; FileFormatFixtureMixin() : format_(FormatHelper::MakeFormat()), opts_(std::make_shared()) {} int64_t expected_batches() const { return kBatchRepetitions; } int64_t expected_rows() const { return kBatchSize * kBatchRepetitions; } std::shared_ptr MakeFragment(const FileSource& source) { EXPECT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(source)); return fragment; } std::shared_ptr MakeFragment(const FileSource& source, compute::Expression partition_expression) { EXPECT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(source, partition_expression)); return fragment; } std::shared_ptr GetFileSource(RecordBatchReader* reader) { EXPECT_OK_AND_ASSIGN(auto buffer, FormatHelper::Write(reader)); return 
std::make_shared(std::move(buffer)); } virtual std::shared_ptr GetRecordBatchReader( std::shared_ptr schema) { return MakeGeneratedRecordBatch(schema, kBatchSize, kBatchRepetitions); } Result> GetFileSink() { ARROW_ASSIGN_OR_RAISE(std::shared_ptr buffer, AllocateResizableBuffer(0)); return std::make_shared(buffer); } void SetSchema(std::vector> fields) { opts_->dataset_schema = schema(std::move(fields)); ASSERT_OK_AND_ASSIGN(auto projection, ProjectionDescr::Default(*opts_->dataset_schema)); SetProjection(opts_.get(), std::move(projection)); } void SetFilter(compute::Expression filter) { ASSERT_OK_AND_ASSIGN(opts_->filter, filter.Bind(*opts_->dataset_schema)); } void Project(std::vector names) { ASSERT_OK_AND_ASSIGN(auto projection, ProjectionDescr::FromNames( std::move(names), *opts_->dataset_schema)); SetProjection(opts_.get(), std::move(projection)); } void ProjectNested(std::vector names) { std::vector exprs; for (const auto& name : names) { ASSERT_OK_AND_ASSIGN(auto ref, FieldRef::FromDotPath(name)); exprs.push_back(field_ref(ref)); } ASSERT_OK_AND_ASSIGN( auto descr, ProjectionDescr::FromExpressions(std::move(exprs), std::move(names), *opts_->dataset_schema)); SetProjection(opts_.get(), std::move(descr)); } // Shared test cases void AssertInspectFailure(const std::string& contents, StatusCode code, const std::string& format_name) { SCOPED_TRACE("Format: " + format_name + " File contents: " + contents); constexpr auto file_name = "herp/derp"; auto make_error_message = [&](const std::string& filename) { return "Could not open " + format_name + " input source '" + filename + "':"; }; const auto buf = std::make_shared(contents); Status status; status = format_->Inspect(FileSource(buf)).status(); EXPECT_EQ(code, status.code()); EXPECT_THAT(status.ToString(), ::testing::HasSubstr(make_error_message(""))); ASSERT_OK_AND_EQ(false, format_->IsSupported(FileSource(buf))); ASSERT_OK_AND_ASSIGN( auto fs, fs::internal::MockFileSystem::Make(fs::kNoTime, 
{fs::File(file_name)})); status = format_->Inspect({file_name, fs}).status(); EXPECT_EQ(code, status.code()); EXPECT_THAT(status.ToString(), testing::HasSubstr(make_error_message("herp/derp"))); fs::FileSelector s; s.base_dir = "/"; s.recursive = true; FileSystemFactoryOptions options; ASSERT_OK_AND_ASSIGN(auto factory, FileSystemDatasetFactory::Make(fs, s, format_, options)); status = factory->Finish().status(); EXPECT_EQ(code, status.code()); EXPECT_THAT( status.ToString(), ::testing::AllOf( ::testing::HasSubstr(make_error_message("/herp/derp")), ::testing::HasSubstr( "Error creating dataset. Could not read schema from '/herp/derp':"), ::testing::HasSubstr("Is this a '" + format_->type_name() + "' file?"))); } void TestInspectFailureWithRelevantError(StatusCode code, const std::string& format_name) { const std::vector file_contents{"", "PAR0", "ASDFPAR1", "ARROW1"}; for (const auto& contents : file_contents) { AssertInspectFailure(contents, code, format_name); } } void TestInspect() { auto reader = GetRecordBatchReader(schema({field("f64", float64())})); auto source = GetFileSource(reader.get()); ASSERT_OK_AND_ASSIGN(auto actual, format_->Inspect(*source.get())); AssertSchemaEqual(*actual, *reader->schema(), /*check_metadata=*/false); } void TestIsSupported() { auto reader = GetRecordBatchReader(schema({field("f64", float64())})); auto source = GetFileSource(reader.get()); bool supported = false; std::shared_ptr buf = std::make_shared(util::string_view("")); ASSERT_OK_AND_ASSIGN(supported, format_->IsSupported(FileSource(buf))); ASSERT_EQ(supported, false); buf = std::make_shared(util::string_view("corrupted")); ASSERT_OK_AND_ASSIGN(supported, format_->IsSupported(FileSource(buf))); ASSERT_EQ(supported, false); ASSERT_OK_AND_ASSIGN(supported, format_->IsSupported(*source)); EXPECT_EQ(supported, true); } std::shared_ptr WriteToBuffer( std::shared_ptr schema, std::shared_ptr options = nullptr) { auto format = format_; SetSchema(schema->fields()); 
EXPECT_OK_AND_ASSIGN(auto sink, GetFileSink()); if (!options) options = format->DefaultWriteOptions(); EXPECT_OK_AND_ASSIGN(auto fs, fs::internal::MockFileSystem::Make(fs::kNoTime, {})); EXPECT_OK_AND_ASSIGN(auto writer, format->MakeWriter(sink, schema, options, {fs, ""})); ARROW_EXPECT_OK(writer->Write(GetRecordBatchReader(schema).get())); auto fut = writer->Finish(); EXPECT_FINISHES(fut); ARROW_EXPECT_OK(fut.status()); EXPECT_OK_AND_ASSIGN(auto written, sink->Finish()); return written; } void TestWrite() { auto reader = this->GetRecordBatchReader(schema({field("f64", float64())})); auto source = this->GetFileSource(reader.get()); auto written = this->WriteToBuffer(reader->schema()); AssertBufferEqual(*written, *source->buffer()); } void TestCountRows() { auto options = std::make_shared(); auto reader = this->GetRecordBatchReader(schema({field("f64", float64())})); auto full_schema = schema({field("f64", float64()), field("part", int64())}); auto source = this->GetFileSource(reader.get()); auto fragment = this->MakeFragment(*source); ASSERT_FINISHES_OK_AND_EQ(util::make_optional(expected_rows()), fragment->CountRows(literal(true), options)); fragment = this->MakeFragment(*source, equal(field_ref("part"), literal(2))); ASSERT_FINISHES_OK_AND_EQ(util::make_optional(expected_rows()), fragment->CountRows(literal(true), options)); auto predicate = equal(field_ref("part"), literal(1)); ASSERT_OK_AND_ASSIGN(predicate, predicate.Bind(*full_schema)); ASSERT_FINISHES_OK_AND_EQ(util::make_optional(0), fragment->CountRows(predicate, options)); predicate = equal(field_ref("part"), literal(2)); ASSERT_OK_AND_ASSIGN(predicate, predicate.Bind(*full_schema)); ASSERT_FINISHES_OK_AND_EQ(util::make_optional(expected_rows()), fragment->CountRows(predicate, options)); predicate = equal(call("add", {field_ref("f64"), literal(3)}), literal(2)); ASSERT_OK_AND_ASSIGN(predicate, predicate.Bind(*full_schema)); ASSERT_FINISHES_OK_AND_EQ(util::nullopt, fragment->CountRows(predicate, options)); 
} protected: std::shared_ptr format_; std::shared_ptr opts_; }; template class FileFormatScanMixin : public FileFormatFixtureMixin, public ::testing::WithParamInterface { public: int64_t expected_batches() const { return GetParam().num_batches; } int64_t expected_rows() const { return GetParam().expected_rows(); } std::shared_ptr GetRecordBatchReader( std::shared_ptr schema) override { return MakeGeneratedRecordBatch(schema, GetParam().items_per_batch, GetParam().num_batches); } // Scan the fragment through the scanner. RecordBatchIterator Batches(std::shared_ptr fragment) { auto dataset = std::make_shared(opts_->dataset_schema, FragmentVector{fragment}); ScannerBuilder builder(dataset, opts_); ARROW_EXPECT_OK(builder.UseThreads(GetParam().use_threads)); EXPECT_OK_AND_ASSIGN(auto scanner, builder.Finish()); EXPECT_OK_AND_ASSIGN(auto batch_it, scanner->ScanBatches()); return MakeMapIterator([](TaggedRecordBatch tagged) { return tagged.record_batch; }, std::move(batch_it)); } // Scan the fragment directly, without using the scanner. 
RecordBatchIterator PhysicalBatches(std::shared_ptr fragment) { opts_->use_threads = GetParam().use_threads; EXPECT_OK_AND_ASSIGN(auto batch_gen, fragment->ScanBatchesAsync(opts_)); auto batch_it = MakeGeneratorIterator(std::move(batch_gen)); return batch_it; } // Shared test cases void TestScan() { auto reader = GetRecordBatchReader(schema({field("f64", float64())})); auto source = this->GetFileSource(reader.get()); this->SetSchema(reader->schema()->fields()); auto fragment = this->MakeFragment(*source); int64_t row_count = 0; for (auto maybe_batch : Batches(fragment)) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); row_count += batch->num_rows(); } ASSERT_EQ(row_count, GetParam().expected_rows()); } // Ensure batch_size is respected void TestScanBatchSize() { constexpr int kBatchSize = 17; auto reader = GetRecordBatchReader(schema({field("f64", float64())})); auto source = this->GetFileSource(reader.get()); this->SetSchema(reader->schema()->fields()); auto fragment = this->MakeFragment(*source); int64_t row_count = 0; opts_->batch_size = kBatchSize; for (auto maybe_batch : Batches(fragment)) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); ASSERT_LE(batch->num_rows(), kBatchSize); row_count += batch->num_rows(); } ASSERT_EQ(row_count, GetParam().expected_rows()); } // Ensure file formats only return columns needed to fulfill filter/projection void TestScanProjected() { auto f32 = field("f32", float32()); auto f64 = field("f64", float64()); auto i32 = field("i32", int32()); auto i64 = field("i64", int64()); this->SetSchema({f64, i64, f32, i32}); this->Project({"f64"}); this->SetFilter(equal(field_ref("i32"), literal(0))); // NB: projection is applied by the scanner; FileFragment does not evaluate it so // we will not drop "i32" even though it is not projected since we need it for // filtering auto expected_schema = schema({f64, i32}); auto reader = this->GetRecordBatchReader(opts_->dataset_schema); auto source = this->GetFileSource(reader.get()); auto fragment = 
this->MakeFragment(*source); int64_t row_count = 0; for (auto maybe_batch : PhysicalBatches(fragment)) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); row_count += batch->num_rows(); ASSERT_THAT( batch->schema()->fields(), ::testing::UnorderedPointwise(PointeesEqual(), expected_schema->fields())) << "EXPECTED:\n" << expected_schema->ToString() << "\nACTUAL:\n" << batch->schema()->ToString(); } ASSERT_EQ(row_count, expected_rows()); } void TestScanProjectedNested(bool fine_grained_selection = false) { auto f32 = field("f32", float32()); auto f64 = field("f64", float64()); auto i32 = field("i32", int32()); auto i64 = field("i64", int64()); auto struct1 = field("struct1", struct_({f32, i32})); auto struct2 = field("struct2", struct_({f64, i64, struct1})); this->SetSchema({struct1, struct2, f32, f64, i32, i64}); this->ProjectNested({".struct1.f32", ".struct2.struct1", ".struct2.struct1.f32"}); this->SetFilter(greater_equal(field_ref(FieldRef("struct2", "i64")), literal(0))); std::shared_ptr physical_schema; if (fine_grained_selection) { // Some formats, like Parquet, let you pluck only a part of a complex type physical_schema = schema( {field("struct1", struct_({f32})), field("struct2", struct_({i64, struct1}))}); } else { // Otherwise, the entire top-level field is returned physical_schema = schema({struct1, struct2}); } std::shared_ptr projected_schema = schema({ field(".struct1.f32", float32()), field(".struct2.struct1", struct1->type()), field(".struct2.struct1.f32", float32()), }); { auto reader = this->GetRecordBatchReader(opts_->dataset_schema); auto source = this->GetFileSource(reader.get()); auto fragment = this->MakeFragment(*source); int64_t row_count = 0; for (auto maybe_batch : PhysicalBatches(fragment)) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); row_count += batch->num_rows(); ASSERT_THAT( batch->schema()->fields(), ::testing::UnorderedPointwise(PointeesEqual(), physical_schema->fields())) << "EXPECTED:\n" << physical_schema->ToString() << 
"\nACTUAL:\n" << batch->schema()->ToString(); } ASSERT_EQ(row_count, expected_rows()); } { auto reader = this->GetRecordBatchReader(opts_->dataset_schema); auto source = this->GetFileSource(reader.get()); auto fragment = this->MakeFragment(*source); int64_t row_count = 0; for (auto maybe_batch : Batches(fragment)) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); row_count += batch->num_rows(); AssertSchemaEqual(*batch->schema(), *projected_schema, /*check_metadata=*/false); } ASSERT_LE(row_count, expected_rows()); ASSERT_GT(row_count, 0); } { // File includes a duplicated name in struct2 auto struct2_physical = field("struct2", struct_({f64, i64, struct1, i64})); auto reader = this->GetRecordBatchReader( schema({struct1, struct2_physical, f32, f64, i32, i64})); auto source = this->GetFileSource(reader.get()); auto fragment = this->MakeFragment(*source); auto iterator = PhysicalBatches(fragment); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("i64"), iterator.Next().status()); } { // File is missing a child in struct1 auto struct1_physical = field("struct1", struct_({i32})); auto reader = this->GetRecordBatchReader( schema({struct1_physical, struct2, f32, f64, i32, i64})); auto source = this->GetFileSource(reader.get()); auto fragment = this->MakeFragment(*source); physical_schema = schema({physical_schema->field(1)}); int64_t row_count = 0; for (auto maybe_batch : PhysicalBatches(fragment)) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); row_count += batch->num_rows(); ASSERT_THAT( batch->schema()->fields(), ::testing::UnorderedPointwise(PointeesEqual(), physical_schema->fields())) << "EXPECTED:\n" << physical_schema->ToString() << "\nACTUAL:\n" << batch->schema()->ToString(); } ASSERT_EQ(row_count, expected_rows()); } } void TestScanProjectedMissingCols() { auto f32 = field("f32", float32()); auto f64 = field("f64", float64()); auto i32 = field("i32", int32()); auto i64 = field("i64", int64()); this->SetSchema({f64, i64, f32, i32}); 
this->Project({"f64"}); this->SetFilter(equal(field_ref("i32"), literal(0))); auto reader_without_i32 = this->GetRecordBatchReader(schema({f64, i64, f32})); auto reader_without_f64 = this->GetRecordBatchReader(schema({i64, f32, i32})); auto reader = this->GetRecordBatchReader(schema({f64, i64, f32, i32})); auto readers = {reader.get(), reader_without_i32.get(), reader_without_f64.get()}; for (auto reader : readers) { SCOPED_TRACE(reader->schema()->ToString()); auto source = this->GetFileSource(reader); auto fragment = this->MakeFragment(*source); // NB: projection is applied by the scanner; FileFragment does not evaluate it so // we will not drop "i32" even though it is not projected since we need it for // filtering // // in the case where a file doesn't contain a referenced field, we won't // materialize it as nulls later std::shared_ptr expected_schema; if (reader == reader_without_i32.get()) { expected_schema = schema({f64}); } else if (reader == reader_without_f64.get()) { expected_schema = schema({i32}); } else { expected_schema = schema({f64, i32}); } int64_t row_count = 0; for (auto maybe_batch : PhysicalBatches(fragment)) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); row_count += batch->num_rows(); ASSERT_THAT( batch->schema()->fields(), ::testing::UnorderedPointwise(PointeesEqual(), expected_schema->fields())) << "EXPECTED:\n" << expected_schema->ToString() << "\nACTUAL:\n" << batch->schema()->ToString(); } ASSERT_EQ(row_count, expected_rows()); } } void TestScanWithVirtualColumn() { auto reader = this->GetRecordBatchReader(schema({field("f64", float64())})); auto source = this->GetFileSource(reader.get()); // NB: dataset_schema includes a column not present in the file this->SetSchema({reader->schema()->field(0), field("virtual", int32())}); auto fragment = this->MakeFragment(*source); ASSERT_OK_AND_ASSIGN(auto physical_schema, fragment->ReadPhysicalSchema()); AssertSchemaEqual(Schema({field("f64", float64())}), *physical_schema); { int64_t row_count 
= 0; for (auto maybe_batch : Batches(fragment)) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); AssertSchemaEqual(*batch->schema(), *opts_->projected_schema); row_count += batch->num_rows(); } ASSERT_EQ(row_count, expected_rows()); } { int64_t row_count = 0; for (auto maybe_batch : PhysicalBatches(fragment)) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); AssertSchemaEqual(*batch->schema(), *physical_schema); row_count += batch->num_rows(); } ASSERT_EQ(row_count, expected_rows()); } } void TestScanWithDuplicateColumn() { // A duplicate column is ignored if not requested. auto i32 = field("i32", int32()); auto i64 = field("i64", int64()); this->opts_->dataset_schema = schema({i32, i32, i64}); this->Project({"i64"}); auto expected_schema = schema({i64}); auto reader = this->GetRecordBatchReader(opts_->dataset_schema); auto source = this->GetFileSource(reader.get()); auto fragment = this->MakeFragment(*source); int64_t row_count = 0; for (auto maybe_batch : PhysicalBatches(fragment)) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); row_count += batch->num_rows(); AssertSchemaEqual(*batch->schema(), *expected_schema, /*check_metadata=*/false); } ASSERT_EQ(row_count, expected_rows()); } void TestScanWithDuplicateColumnError() { // A duplicate column leads to an error if requested. 
auto i32 = field("i32", int32()); auto i64 = field("i64", int64()); this->opts_->dataset_schema = schema({i32, i32, i64}); ASSERT_RAISES(Invalid, ProjectionDescr::FromNames({"i32"}, *this->opts_->dataset_schema)); } void TestScanWithPushdownNulls() { // Regression test for ARROW-15312 auto i64 = field("i64", int64()); this->SetSchema({i64}); this->SetFilter(is_null(field_ref("i64"))); auto rb = RecordBatchFromJSON(schema({i64}), R"([ [null], [32] ])"); ASSERT_OK_AND_ASSIGN(auto reader, RecordBatchReader::Make({rb})); auto source = this->GetFileSource(reader.get()); auto fragment = this->MakeFragment(*source); int64_t row_count = 0; for (auto maybe_batch : Batches(fragment)) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); row_count += batch->num_rows(); } ASSERT_EQ(row_count, 1); } protected: using FileFormatFixtureMixin::opts_; }; /// \brief A dummy FileFormat implementation class DummyFileFormat : public FileFormat { public: explicit DummyFileFormat(std::shared_ptr schema = NULLPTR) : schema_(std::move(schema)) {} std::string type_name() const override { return "dummy"; } bool Equals(const FileFormat& other) const override { return type_name() == other.type_name() && schema_->Equals(checked_cast(other).schema_); } Result IsSupported(const FileSource& source) const override { return true; } Result> Inspect(const FileSource& source) const override { return schema_; } /// \brief Open a file for scanning (always returns an empty generator) Result ScanBatchesAsync( const std::shared_ptr& options, const std::shared_ptr& fragment) const override { return MakeEmptyGenerator>(); } Result> MakeWriter( std::shared_ptr destination, std::shared_ptr schema, std::shared_ptr options, fs::FileLocator destination_locator) const override { return Status::NotImplemented("writing fragment of DummyFileFormat"); } std::shared_ptr DefaultWriteOptions() override { return nullptr; } protected: std::shared_ptr schema_; }; class JSONRecordBatchFileFormat : public FileFormat { public: using 
// NOTE(review): continuation of JSONRecordBatchFileFormat -- a FileFormat
// whose "files" hold a JSON-encoded record batch; the schema is produced by a
// pluggable SchemaResolver.  Extraction stripped template arguments (e.g.
// `std::function<...>`, `Result<...>`) and collapsed newlines, so the `///`
// comment mid-line swallows the rest of the physical line; code is preserved
// byte-for-byte.  Also begins MakeFileSystemDatasetMixin: ParsePathList turns
// a newline-separated path list (skipping blanks and '#' comments, trailing
// '/' means directory) into mock-filesystem entries.
SchemaResolver = std::function(const FileSource&)>; explicit JSONRecordBatchFileFormat(std::shared_ptr schema) : resolver_([schema](const FileSource&) { return schema; }) {} explicit JSONRecordBatchFileFormat(SchemaResolver resolver) : resolver_(std::move(resolver)) {} bool Equals(const FileFormat& other) const override { return this == &other; } std::string type_name() const override { return "json_record_batch"; } /// \brief Return true if the given file extension Result IsSupported(const FileSource& source) const override { return true; } Result> Inspect(const FileSource& source) const override { return resolver_(source); } Result ScanBatchesAsync( const std::shared_ptr& options, const std::shared_ptr& fragment) const override { ARROW_ASSIGN_OR_RAISE(auto file, fragment->source().Open()); ARROW_ASSIGN_OR_RAISE(int64_t size, file->GetSize()); ARROW_ASSIGN_OR_RAISE(auto buffer, file->Read(size)); ARROW_ASSIGN_OR_RAISE(auto schema, Inspect(fragment->source())); RecordBatchVector batches{RecordBatchFromJSON(schema, util::string_view{*buffer})}; return MakeVectorGenerator(std::move(batches)); } Result> MakeWriter( std::shared_ptr destination, std::shared_ptr schema, std::shared_ptr options, fs::FileLocator destination_locator) const override { return Status::NotImplemented("writing fragment of JSONRecordBatchFileFormat"); } std::shared_ptr DefaultWriteOptions() override { return nullptr; } protected: SchemaResolver resolver_; }; struct MakeFileSystemDatasetMixin { std::vector ParsePathList(const std::string& pathlist) { std::vector infos; std::stringstream ss(pathlist); std::string line; while (std::getline(ss, line)) { auto start = line.find_first_not_of(" \n\r\t"); if (start == std::string::npos) { continue; } line.erase(0, start); if (line.front() == '#') { continue; } if (line.back() == '/') { infos.push_back(fs::Dir(line)); continue; } infos.push_back(fs::File(line)); } return infos; } void MakeFileSystem(const std::vector& infos) { ASSERT_OK_AND_ASSIGN(fs_, 
// NOTE(review): previous physical line ends mid-statement
// ("ASSERT_OK_AND_ASSIGN(fs_,"); the next line completes the mock-filesystem
// construction, adds the paths-only MakeFileSystem overload, and MakeDataset,
// which builds a FileSystemDataset of DummyFileFormat fragments (one per file
// entry, each with its partition expression).  It then defines the PathOf /
// PathsOf / AssertFilesAre test helpers and opens AssertFragmentsAreFromPath.
fs::internal::MockFileSystem::Make(fs::kNoTime, infos)); } void MakeFileSystem(const std::vector& paths) { std::vector infos{paths.size()}; std::transform(paths.cbegin(), paths.cend(), infos.begin(), [](const std::string& p) { return fs::File(p); }); ASSERT_OK_AND_ASSIGN(fs_, fs::internal::MockFileSystem::Make(fs::kNoTime, infos)); } void MakeDataset(const std::vector& infos, compute::Expression root_partition = literal(true), std::vector partitions = {}, std::shared_ptr s = schema({})) { auto n_fragments = infos.size(); if (partitions.empty()) { partitions.resize(n_fragments, literal(true)); } MakeFileSystem(infos); auto format = std::make_shared(s); std::vector> fragments; for (size_t i = 0; i < n_fragments; i++) { const auto& info = infos[i]; if (!info.IsFile()) { continue; } ASSERT_OK_AND_ASSIGN(auto fragment, format->MakeFragment({info, fs_}, partitions[i])); fragments.push_back(std::move(fragment)); } ASSERT_OK_AND_ASSIGN(dataset_, FileSystemDataset::Make(s, root_partition, format, fs_, std::move(fragments))); } std::shared_ptr fs_; std::shared_ptr dataset_; std::shared_ptr options_; }; static const std::string& PathOf(const std::shared_ptr& fragment) { EXPECT_NE(fragment, nullptr); EXPECT_THAT(fragment->type_name(), "dummy"); return checked_cast(*fragment).source().path(); } class TestFileSystemDataset : public ::testing::Test, public MakeFileSystemDatasetMixin {}; static std::vector PathsOf(const FragmentVector& fragments) { std::vector paths(fragments.size()); std::transform(fragments.begin(), fragments.end(), paths.begin(), PathOf); return paths; } void AssertFilesAre(const std::shared_ptr& dataset, std::vector expected) { auto fs_dataset = checked_cast(dataset.get()); EXPECT_THAT(fs_dataset->files(), testing::UnorderedElementsAreArray(expected)); } void AssertFragmentsAreFromPath(FragmentIterator it, std::vector expected) { // Ordering is not guaranteed. 
// NOTE(review): finishes AssertFragmentsAreFromPath, then defines
// PartitionExpressionsOf / AssertFragmentsHavePartitionExpressions (unordered
// comparisons of fragment paths and partition expressions), and begins
// ArithmeticDatasetFixture, whose schema() has i64 / struct{i32,str} / u8 /
// list<int32> / bool fields and whose JSONRecordFor(n) renders one JSON row
// templated on n.  Code is preserved byte-for-byte (extraction collapsed
// newlines and stripped template arguments).
EXPECT_THAT(PathsOf(IteratorToVector(std::move(it))), testing::UnorderedElementsAreArray(expected)); } static std::vector PartitionExpressionsOf( const FragmentVector& fragments) { std::vector partition_expressions; std::transform(fragments.begin(), fragments.end(), std::back_inserter(partition_expressions), [](const std::shared_ptr& fragment) { return fragment->partition_expression(); }); return partition_expressions; } void AssertFragmentsHavePartitionExpressions(std::shared_ptr dataset, std::vector expected) { ASSERT_OK_AND_ASSIGN(auto fragment_it, dataset->GetFragments()); // Ordering is not guaranteed. EXPECT_THAT(PartitionExpressionsOf(IteratorToVector(std::move(fragment_it))), testing::UnorderedElementsAreArray(expected)); } struct ArithmeticDatasetFixture { static std::shared_ptr schema() { return ::arrow::schema({ field("i64", int64()), field("struct", struct_({ field("i32", int32()), field("str", utf8()), })), field("u8", uint8()), field("list", list(int32())), field("bool", boolean()), }); } /// \brief Creates a single JSON record templated with n as follow. /// /// {"i64": n, "struct": {"i32": n, "str": "n"}, "u8": n "list": [n,n], "bool": n % /// 2}, static std::string JSONRecordFor(int64_t n) { std::stringstream ss; auto n_i32 = static_cast(n); ss << "{"; ss << "\"i64\": " << n << ", "; ss << "\"struct\": {"; { ss << "\"i32\": " << n_i32 << ", "; ss << R"("str": ")" << std::to_string(n) << "\""; } ss << "}, "; ss << "\"u8\": " << static_cast(n) << ", "; ss << "\"list\": [" << n_i32 << ", " << n_i32 << "], "; ss << "\"bool\": " << (static_cast(n % 2) ? 
// NOTE(review): previous physical line ends mid-ternary; the next two lines
// complete ArithmeticDatasetFixture (JSONRecordBatch(n) repeats the record n
// times; GetRecordBatchReader yields n batches via a stateful functor) and
// begin WriteFileSystemDatasetMixin::MakeSourceDataset, which populates a
// mock filesystem with partitioned JSON sales data (plus a ".pesky" file that
// discovery must ignore), then builds the source dataset, scan options, and
// a full projection.  Do not insert anything between the next two lines: the
// physical break may fall inside literal/comment text.
21, "country": "US"} ])"; source_files["/dataset/year=2019/month=01/dat1.json"] = R"([ {"region": "QC", "model": "3", "sales": 152.25, "country": "CA"}, {"region": "QC", "model": "X", "sales": 42, "country": "CA"}, {"region": "QC", "model": "Y", "sales": 37, "country": "CA"} ])"; source_files["/dataset/.pesky"] = "garbage content"; auto mock_fs = std::make_shared(fs::kNoTime); for (const auto& f : source_files) { ARROW_EXPECT_OK(mock_fs->CreateFile(f.first, f.second, /* recursive */ true)); } fs_ = mock_fs; /// schema for the whole dataset (both source and destination) source_schema_ = schema({ field("region", utf8()), field("model", utf8()), field("sales", float64()), field("year", int32()), field("month", int32()), field("country", utf8()), }); /// Dummy file format for source dataset. Note that it isn't partitioned on country auto source_format = std::make_shared( SchemaFromColumnNames(source_schema_, {"region", "model", "sales", "country"})); fs::FileSelector s; s.base_dir = "/dataset"; s.recursive = true; FileSystemFactoryOptions options; options.selector_ignore_prefixes = {"."}; options.partitioning = std::make_shared( SchemaFromColumnNames(source_schema_, {"year", "month"})); ASSERT_OK_AND_ASSIGN(auto factory, FileSystemDatasetFactory::Make(fs_, s, source_format, options)); ASSERT_OK_AND_ASSIGN(dataset_, factory->Finish()); scan_options_ = std::make_shared(); scan_options_->dataset_schema = dataset_->schema(); ASSERT_OK_AND_ASSIGN( auto projection, ProjectionDescr::FromNames(source_schema_->field_names(), *dataset_->schema())); SetProjection(scan_options_.get(), std::move(projection)); } void SetWriteOptions(std::shared_ptr file_write_options) { write_options_.file_write_options = file_write_options; write_options_.filesystem = fs_; write_options_.base_dir = "/new_root/"; write_options_.basename_template = "dat_{i}"; write_options_.writer_pre_finish = [this](FileWriter* writer) { visited_paths_.push_back(writer->destination().path); return Status::OK(); }; } 
// NOTE(review): remainder of WriteFileSystemDatasetMixin.  DoWrite() scans the
// source dataset, writes it under /new_root with the requested partitioning,
// and re-discovers the written dataset.  The four TestWriteWith* methods cover
// identical / unrelated / superset / empty partitioning schemas, recording the
// files each should produce.  Preserved byte-for-byte (extraction collapsed
// newlines; `//` comments swallow the remainder of each physical line).
void DoWrite(std::shared_ptr desired_partitioning) { write_options_.partitioning = desired_partitioning; auto scanner_builder = ScannerBuilder(dataset_, scan_options_); ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder.Finish()); ASSERT_OK(FileSystemDataset::Write(write_options_, scanner)); // re-discover the written dataset fs::FileSelector s; s.recursive = true; s.base_dir = "/new_root"; FileSystemFactoryOptions factory_options; factory_options.partitioning = desired_partitioning; ASSERT_OK_AND_ASSIGN( auto factory, FileSystemDatasetFactory::Make(fs_, s, format_, factory_options)); ASSERT_OK_AND_ASSIGN(written_, factory->Finish()); } void TestWriteWithIdenticalPartitioningSchema() { DoWrite(std::make_shared( SchemaFromColumnNames(source_schema_, {"year", "month"}))); expected_files_["/new_root/2018/1/dat_0"] = R"([ {"region": "QC", "model": "X", "sales": 1.0, "country": "CA"}, {"region": "NY", "model": "Y", "sales": 27.5, "country": "US"}, {"region": "QC", "model": "Y", "sales": 69, "country": "CA"}, {"region": "NY", "model": "X", "sales": 136.25, "country": "US"}, {"region": "NY", "model": "S", "sales": 304.125, "country": "US"}, {"region": "QC", "model": "3", "sales": 512, "country": "CA"}, {"region": "NY", "model": "3", "sales": 742.0, "country": "US"}, {"region": "QC", "model": "S", "sales": 978, "country": "CA"} ])"; expected_files_["/new_root/2019/1/dat_0"] = R"([ {"region": "QC", "model": "S", "sales": 10, "country": "CA"}, {"region": "CA", "model": "S", "sales": 13, "country": "US"}, {"region": "CA", "model": "Y", "sales": 21, "country": "US"}, {"region": "QC", "model": "Y", "sales": 37, "country": "CA"}, {"region": "QC", "model": "X", "sales": 42, "country": "CA"}, {"region": "CA", "model": "X", "sales": 54, "country": "US"}, {"region": "QC", "model": "3", "sales": 152.25, "country": "CA"}, {"region": "CA", "model": "3", "sales": 273.5, "country": "US"} ])"; expected_physical_schema_ = SchemaFromColumnNames(source_schema_, {"region", "model", "sales", 
// NOTE(review): previous physical line ends mid-argument-list; the next line
// continues with "country"});.  It asserts the identical-partitioning write,
// then defines TestWriteWithUnrelatedPartitioningSchema (partition on
// country/region instead of year/month) and opens the superset test.
"country"}); AssertWrittenAsExpected(); } void TestWriteWithUnrelatedPartitioningSchema() { DoWrite(std::make_shared( SchemaFromColumnNames(source_schema_, {"country", "region"}))); // XXX first thing a user will be annoyed by: we don't support left // padding the month field with 0. expected_files_["/new_root/US/NY/dat_0"] = R"([ {"year": 2018, "month": 1, "model": "Y", "sales": 27.5}, {"year": 2018, "month": 1, "model": "X", "sales": 136.25}, {"year": 2018, "month": 1, "model": "S", "sales": 304.125}, {"year": 2018, "month": 1, "model": "3", "sales": 742.0} ])"; expected_files_["/new_root/CA/QC/dat_0"] = R"([ {"year": 2018, "month": 1, "model": "X", "sales": 1.0}, {"year": 2019, "month": 1, "model": "S", "sales": 10}, {"year": 2019, "month": 1, "model": "Y", "sales": 37}, {"year": 2019, "month": 1, "model": "X", "sales": 42}, {"year": 2018, "month": 1, "model": "Y", "sales": 69}, {"year": 2019, "month": 1, "model": "3", "sales": 152.25}, {"year": 2018, "month": 1, "model": "3", "sales": 512}, {"year": 2018, "month": 1, "model": "S", "sales": 978} ])"; expected_files_["/new_root/US/CA/dat_0"] = R"([ {"year": 2019, "month": 1, "model": "S", "sales": 13}, {"year": 2019, "month": 1, "model": "Y", "sales": 21}, {"year": 2019, "month": 1, "model": "X", "sales": 54}, {"year": 2019, "month": 1, "model": "3", "sales": 273.5} ])"; expected_physical_schema_ = SchemaFromColumnNames(source_schema_, {"model", "sales", "year", "month"}); AssertWrittenAsExpected(); } void TestWriteWithSupersetPartitioningSchema() { DoWrite(std::make_shared( SchemaFromColumnNames(source_schema_, {"year", "month", "country", "region"}))); // XXX first thing a user will be annoyed by: we don't support left // padding the month field with 0. 
// NOTE(review): the next two physical lines must stay adjacent with nothing
// inserted between them -- the break falls inside an open raw string literal
// of the empty-partitioning expected file.  They cover the superset- and
// empty-partitioning expected outputs.
expected_files_["/new_root/2018/1/US/NY/dat_0"] = R"([ {"model": "Y", "sales": 27.5}, {"model": "X", "sales": 136.25}, {"model": "S", "sales": 304.125}, {"model": "3", "sales": 742.0} ])"; expected_files_["/new_root/2018/1/CA/QC/dat_0"] = R"([ {"model": "X", "sales": 1.0}, {"model": "Y", "sales": 69}, {"model": "3", "sales": 512}, {"model": "S", "sales": 978} ])"; expected_files_["/new_root/2019/1/US/CA/dat_0"] = R"([ {"model": "S", "sales": 13}, {"model": "Y", "sales": 21}, {"model": "X", "sales": 54}, {"model": "3", "sales": 273.5} ])"; expected_files_["/new_root/2019/1/CA/QC/dat_0"] = R"([ {"model": "S", "sales": 10}, {"model": "Y", "sales": 37}, {"model": "X", "sales": 42}, {"model": "3", "sales": 152.25} ])"; expected_physical_schema_ = SchemaFromColumnNames(source_schema_, {"model", "sales"}); AssertWrittenAsExpected(); } void TestWriteWithEmptyPartitioningSchema() { DoWrite(std::make_shared( SchemaFromColumnNames(source_schema_, {}))); expected_files_["/new_root/dat_0"] = R"([ {"country": "CA", "region": "QC", "year": 2018, "month": 1, "model": "X", "sales": 1.0}, {"country": "CA", "region": "QC", "year": 2019, "month": 1, "model": "S", "sales": 10}, {"country": "US", "region": "CA", "year": 2019, "month": 1, "model": "S", "sales": 13}, {"country": "US", "region": "CA", "year": 2019, "month": 1, "model": "Y", "sales": 21}, {"country": "US", "region": "NY", "year": 2018, "month": 1, "model": "Y", "sales": 27.5}, {"country": "CA", "region": "QC", "year": 2019, "month": 1, "model": "Y", "sales": 37}, {"country": "CA", "region": "QC", "year": 2019, "month": 1, "model": "X", "sales": 42}, {"country": "US", "region": "CA", "year": 2019, "month": 1, "model": "X", "sales": 54}, {"country": "CA", "region": "QC", "year": 2018, "month": 1, "model": "Y", "sales": 69}, {"country": "US", "region": "NY", "year": 2018, "month": 1, "model": "X", "sales": 136.25}, {"country": "CA", "region": "QC", "year": 2019, "month": 1, "model": "3", "sales": 152.25}, {"country": "US", 
"region": "CA", "year": 2019, "month": 1, "model": "3", "sales": 273.5}, {"country": "US", "region": "NY", "year": 2018, "month": 1, "model": "S", "sales": 304.125}, {"country": "CA", "region": "QC", "year": 2018, "month": 1, "model": "3", "sales": 512}, {"country": "US", "region": "NY", "year": 2018, "month": 1, "model": "3", "sales": 742.0}, {"country": "CA", "region": "QC", "year": 2018, "month": 1, "model": "S", "sales": 978} ])"; expected_physical_schema_ = source_schema_; AssertWrittenAsExpected(); } void AssertWrittenAsExpected() { std::unordered_set expected_paths, actual_paths; for (const auto& file_contents : expected_files_) { expected_paths.insert(file_contents.first); } // expect the written filesystem to contain precisely the paths we expected for (auto path : checked_pointer_cast(written_)->files()) { actual_paths.insert(std::move(path)); } EXPECT_THAT(actual_paths, testing::UnorderedElementsAreArray(expected_paths)); // Additionally, the writer producing each written file was visited and its path // collected. 
// NOTE(review): the next physical line begins with stray prose ("That should
// match ...") whose leading `//` was lost when the original comment's line
// break was collapsed -- restore the comment marker when reformatting.  The
// line is the body of AssertWrittenAsExpected: it compares written vs
// expected paths and visited paths, checks each fragment's physical schema,
// and compares each written file's sales-sorted contents against the expected
// JSON.  The trailing tokens close the class and the dataset/arrow
// namespaces (the namespace-comment split "// namespace" / "dataset" is
// likewise an artifact of the lost line break).
That should match the expected paths as well EXPECT_THAT(visited_paths_, testing::UnorderedElementsAreArray(expected_paths)); ASSERT_OK_AND_ASSIGN(auto written_fragments_it, written_->GetFragments()); for (auto maybe_fragment : written_fragments_it) { ASSERT_OK_AND_ASSIGN(auto fragment, maybe_fragment); ASSERT_OK_AND_ASSIGN(auto actual_physical_schema, fragment->ReadPhysicalSchema()); AssertSchemaEqual(*expected_physical_schema_, *actual_physical_schema, check_metadata_); const auto& path = checked_pointer_cast(fragment)->source().path(); auto file_contents = expected_files_.find(path); if (file_contents == expected_files_.end()) { // file wasn't expected to be written at all; nothing to compare with continue; } ASSERT_OK_AND_ASSIGN(auto scanner, ScannerBuilder(actual_physical_schema, fragment, std::make_shared()) .Finish()); ASSERT_OK_AND_ASSIGN(auto actual_table, scanner->ToTable()); ASSERT_OK_AND_ASSIGN(actual_table, actual_table->CombineChunks()); std::shared_ptr actual_struct; for (auto maybe_batch : MakeIteratorFromReader(std::make_shared(*actual_table))) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); ASSERT_OK_AND_ASSIGN( auto sort_indices, compute::SortIndices(batch->GetColumnByName("sales"), compute::SortOptions({compute::SortKey{"sales"}}))); ASSERT_OK_AND_ASSIGN(Datum sorted_batch, compute::Take(batch, sort_indices)); ASSERT_OK_AND_ASSIGN(auto struct_array, sorted_batch.record_batch()->ToStructArray()); actual_struct = std::dynamic_pointer_cast(struct_array); } auto expected_struct = ArrayFromJSON(struct_(expected_physical_schema_->fields()), {file_contents->second}); AssertArraysEqual(*expected_struct, *actual_struct, /*verbose=*/true); } } bool check_metadata_ = true; std::shared_ptr source_schema_; std::shared_ptr format_; PathAndContent expected_files_; std::shared_ptr expected_physical_schema_; std::shared_ptr written_; std::vector visited_paths_; FileSystemDatasetWriteOptions write_options_; std::shared_ptr scan_options_; }; } // namespace 
dataset } // namespace arrow