// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <atomic>
#include <cstdint>
#include <cstring>
#include <thread>
#include <unordered_map>
#include <vector>

#include "arrow/buffer.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/memory_pool.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/cpu_info.h"
#include "arrow/util/logging.h"
#include "arrow/util/mutex.h"
#include "arrow/util/optional.h"
#include "arrow/util/thread_pool.h"

#if defined(__clang__) || defined(__GNUC__)
#define BYTESWAP(x) __builtin_bswap64(x)
#define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31)))
#define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63)))
#define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#elif defined(_MSC_VER)
#include <intrin.h>
#define BYTESWAP(x) _byteswap_uint64(x)
#define ROTL(x, n) _rotl((x), (n))
#define ROTL64(x, n) _rotl64((x), (n))
#if defined(_M_X64) || defined(_M_I86)
#include <mmintrin.h>  // https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx
#define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#else
#define PREFETCH(ptr) (void)(ptr) /* disabled */
#endif
#endif

namespace arrow {
namespace util {

template <typename T>
inline void CheckAlignment(const void* ptr) {
  ARROW_DCHECK(reinterpret_cast<uint64_t>(ptr) % sizeof(T) == 0);
}

// Some platforms typedef int64_t as long int instead of long long int,
// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
// which need long long.
// We use the cast to the type below in these intrinsics to make the code
// compile in all cases.
//
using int64_for_gather_t = const long long int;  // NOLINT runtime-int

// All MiniBatch... classes use TempVectorStack for vector allocations and can
// only work with vectors up to 1024 elements.
//
// They should only be allocated on the stack to guarantee the right sequence
// of allocation and deallocation of vectors from TempVectorStack.
//
class MiniBatch {
 public:
  static constexpr int kMiniBatchLength = 1024;
};

/// Storage used to allocate temporary vectors of a batch size.
/// Temporary vectors should resemble allocating temporary variables on the stack
/// but in the context of vectorized processing where we need to store a vector of
/// temporaries instead of a single value.
class TempVectorStack {
  template <typename>
  friend class TempVectorHolder;

 public:
  Status Init(MemoryPool* pool, int64_t size) {
    num_vectors_ = 0;
    top_ = 0;
    buffer_size_ = size;
    ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
    // Ensure later operations don't accidentally read uninitialized memory.
    std::memset(buffer->mutable_data(), 0xFF, size);
    buffer_ = std::move(buffer);
    return Status::OK();
  }

 private:
  int64_t PaddedAllocationSize(int64_t num_bytes) {
    // Round up allocation size to multiple of 8 bytes
    // to avoid returning temp vectors with unaligned address.
    //
    // Also add padding at the end to facilitate loads and stores
    // using SIMD when number of vector elements is not divisible
    // by the number of SIMD lanes.
    //
    return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding;
  }
  void alloc(uint32_t num_bytes, uint8_t** data, int* id) {
    int64_t old_top = top_;
    top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
    // Stack overflow check
    ARROW_DCHECK(top_ <= buffer_size_);
    *data = buffer_->mutable_data() + old_top + sizeof(uint64_t);
    // We set 8 bytes before the beginning of the allocated range and
    // 8 bytes after the end to check for stack overflow (which would
    // result in those known bytes being corrupted).
    reinterpret_cast<uint64_t*>(buffer_->mutable_data() + old_top)[0] = kGuard1;
    reinterpret_cast<uint64_t*>(buffer_->mutable_data() + top_)[-1] = kGuard2;
    *id = num_vectors_++;
  }
  void release(int id, uint32_t num_bytes) {
    ARROW_DCHECK(num_vectors_ == id + 1);
    int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
    ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[-1] ==
                 kGuard2);
    ARROW_DCHECK(top_ >= size);
    top_ -= size;
    ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[0] ==
                 kGuard1);
    --num_vectors_;
  }
  static constexpr uint64_t kGuard1 = 0x3141592653589793ULL;
  static constexpr uint64_t kGuard2 = 0x0577215664901532ULL;
  static constexpr int64_t kPadding = 64;
  int num_vectors_;
  int64_t top_;
  std::unique_ptr<Buffer> buffer_;
  int64_t buffer_size_;
};
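
// Usage sketch: how a TempVectorStack is typically paired with the
// TempVectorHolder<T> declared below. The 64-mini-batch budget and the
// uint16_t element type are arbitrary choices for illustration only.
//
//   arrow::util::TempVectorStack stack;
//   ARROW_RETURN_NOT_OK(stack.Init(arrow::default_memory_pool(),
//                                  64 * arrow::util::MiniBatch::kMiniBatchLength));
//   {
//     // Holders borrow space from the stack and must be destroyed in reverse
//     // order of construction, mirroring stack allocation.
//     arrow::util::TempVectorHolder<uint16_t> ids(
//         &stack, arrow::util::MiniBatch::kMiniBatchLength);
//     uint16_t* id_buf = ids.mutable_data();
//     // ... fill and use id_buf for one mini batch of up to 1024 rows ...
//   }  // space released here, before the stack itself goes out of scope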

template <typename T>
class TempVectorHolder {
  friend class TempVectorStack;

 public:
  ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); }
  T* mutable_data() { return reinterpret_cast<T*>(data_); }
  TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) {
    stack_ = stack;
    num_elements_ = num_elements;
    stack_->alloc(num_elements * sizeof(T), &data_, &id_);
  }

 private:
  TempVectorStack* stack_;
  uint8_t* data_;
  int id_;
  uint32_t num_elements_;
};

class bit_util {
 public:
  static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
                              const int num_bits, const uint8_t* bits, int* num_indexes,
                              uint16_t* indexes, int bit_offset = 0);

  static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
                                  const int num_bits, const uint8_t* bits,
                                  const uint16_t* input_indexes, int* num_indexes,
                                  uint16_t* indexes, int bit_offset = 0);

  // Input and output indexes may be pointing to the same data (in-place filtering).
  static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
                                 const uint8_t* bits, int* num_indexes_bit0,
                                 uint16_t* indexes_bit0, uint16_t* indexes_bit1,
                                 int bit_offset = 0);

  // Bit 1 is replaced with byte 0xFF.
  static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
                            const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);

  // Return highest bit of each byte.
  static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
                            const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);

  static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
                                 uint32_t num_bytes);

 private:
  inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
                                            int* num_indexes, uint16_t* indexes);
  inline static void bits_filter_indexes_helper(uint64_t word,
                                                const uint16_t* input_indexes,
                                                int* num_indexes, uint16_t* indexes);
  template <int bit_to_search, bool filter_input_indexes>
  static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
                                       const uint8_t* bits,
                                       const uint16_t* input_indexes, int* num_indexes,
                                       uint16_t* indexes, uint16_t base_index = 0);

#if defined(ARROW_HAVE_AVX2)
  static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
                                   const uint8_t* bits, int* num_indexes,
                                   uint16_t* indexes, uint16_t base_index = 0);
  static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
                                       const uint8_t* bits,
                                       const uint16_t* input_indexes, int* num_indexes,
                                       uint16_t* indexes);
  template <int bit_to_search>
  static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
                                       int* num_indexes, uint16_t* indexes,
                                       uint16_t base_index = 0);
  template <int bit_to_search>
  static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
                                           const uint16_t* input_indexes,
                                           int* num_indexes, uint16_t* indexes);
  static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
                                 uint8_t* bytes);
  static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
                                 uint8_t* bits);
  static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
#endif
};
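
// Usage sketch for the bit_util helpers above (illustrative only). Passing
// hardware_flags = 0 is assumed here to mean "no SIMD dispatch"; callers
// normally forward the flags reported by arrow::internal::CpuInfo.
//
//   constexpr int num_bits = 16;
//   const uint8_t bits[2] = {0b10110001, 0b00000010};  // bits 0, 4, 5, 7, 9 are set
//
//   // Expand to one byte per bit: 0xFF where the bit is 1, 0x00 where it is 0.
//   uint8_t bytes[num_bits];
//   arrow::util::bit_util::bits_to_bytes(/*hardware_flags=*/0, num_bits, bits, bytes);
//
//   // Collect positions of the set bits.
//   int num_indexes = 0;
//   uint16_t indexes[num_bits];
//   arrow::util::bit_util::bits_to_indexes(/*bit_to_search=*/1, /*hardware_flags=*/0,
//                                          num_bits, bits, &num_indexes, indexes);
//   // num_indexes == 5, indexes == {0, 4, 5, 7, 9}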

}  // namespace util

namespace compute {

ARROW_EXPORT
Status ValidateExecNodeInputs(ExecPlan* plan, const std::vector<ExecNode*>& inputs,
                              int expected_num_inputs, const char* kind_name);

ARROW_EXPORT
Result<std::shared_ptr<Table>> TableFromExecBatches(
    const std::shared_ptr<Schema>& schema, const std::vector<ExecBatch>& exec_batches);

class AtomicCounter {
 public:
  AtomicCounter() = default;

  int count() const { return count_.load(); }

  util::optional<int> total() const {
    int total = total_.load();
    if (total == -1) return {};
    return total;
  }

  // return true if the counter is complete
  bool Increment() {
    DCHECK_NE(count_.load(), total_.load());
    int count = count_.fetch_add(1) + 1;
    if (count != total_.load()) return false;
    return DoneOnce();
  }

  // return true if the counter is complete
  bool SetTotal(int total) {
    total_.store(total);
    if (count_.load() != total) return false;
    return DoneOnce();
  }

  // return true if the counter has not already been completed
  bool Cancel() { return DoneOnce(); }

  // return true if the counter has finished or been cancelled
  bool Completed() { return complete_.load(); }

 private:
  // ensure there is only one true return from Increment(), SetTotal(), or Cancel()
  bool DoneOnce() {
    bool expected = false;
    return complete_.compare_exchange_strong(expected, true);
  }

  std::atomic<int> count_{0}, total_{-1};
  std::atomic<bool> complete_{false};
};

class ThreadIndexer {
 public:
  size_t operator()();

  static size_t Capacity();

 private:
  static size_t Check(size_t thread_index);

  util::Mutex mutex_;
  std::unordered_map<std::thread::id, size_t> id_to_index_;
};

}  // namespace compute
}  // namespace arrow
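
// Usage sketch for arrow::compute::AtomicCounter above (illustrative only;
// `num_batches` stands in for however the producer learns the batch count).
// Exactly one of the calls below observes count == total and returns true, so
// finalization runs once whether the total becomes known before or after the
// last Increment().
//
//   arrow::compute::AtomicCounter batch_counter;
//
//   // Producer side, once the number of batches is known:
//   if (batch_counter.SetTotal(num_batches)) {
//     // every batch was already counted; finalize here
//   }
//
//   // Consumer side, for each batch received:
//   if (batch_counter.Increment()) {
//     // this was the last batch; finalize here
//   }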