// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <atomic>
#include <cstdint>
#include <cstring>
#include <thread>
#include <unordered_map>
#include <vector>

#include "arrow/buffer.h"
|
|
#include "arrow/compute/type_fwd.h"
|
|
#include "arrow/memory_pool.h"
|
|
#include "arrow/result.h"
|
|
#include "arrow/status.h"
|
|
#include "arrow/util/bit_util.h"
|
|
#include "arrow/util/cpu_info.h"
|
|
#include "arrow/util/logging.h"
|
|
#include "arrow/util/mutex.h"
|
|
#include "arrow/util/optional.h"
|
|
#include "arrow/util/thread_pool.h"
|
|
|
|
#if defined(__clang__) || defined(__GNUC__)
#define BYTESWAP(x) __builtin_bswap64(x)
#define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31)))
#define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63)))
#define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#elif defined(_MSC_VER)
#include <intrin.h>
#define BYTESWAP(x) _byteswap_uint64(x)
#define ROTL(x, n) _rotl((x), (n))
#define ROTL64(x, n) _rotl64((x), (n))
#if defined(_M_X64) || defined(_M_IX86)
#include <mmintrin.h>  // https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx
#define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#else
#define PREFETCH(ptr) (void)(ptr) /* disabled */
#endif
#endif

namespace arrow {
namespace util {

template <typename T>
inline void CheckAlignment(const void* ptr) {
  ARROW_DCHECK(reinterpret_cast<uint64_t>(ptr) % sizeof(T) == 0);
}

// Some platforms typedef int64_t as long int instead of long long int,
// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
// which need long long.
// We use the cast to the type below in these intrinsics to make the code
// compile in all cases.
//
using int64_for_gather_t = const long long int;  // NOLINT runtime-int
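// A sketch of the kind of AVX2 call site this alias exists for (illustrative
// only: `values` and `offsets` are hypothetical variables, and the intrinsic is
// available only in AVX2-enabled translation units):
//
//   int64_for_gather_t* base = reinterpret_cast<int64_for_gather_t*>(values);
//   __m256i gathered = _mm256_i32gather_epi64(base, offsets, /*scale=*/8);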

// All MiniBatch... classes use TempVectorStack for vector allocations and can
// only work with vectors up to 1024 elements.
//
// They should only be allocated on the stack to guarantee the right sequence
// of allocation and deallocation of vectors from TempVectorStack.
//
class MiniBatch {
 public:
  static constexpr int kMiniBatchLength = 1024;
};

/// Storage used to allocate temporary vectors of a batch size.
/// Temporary vectors should resemble allocating temporary variables on the stack
/// but in the context of vectorized processing where we need to store a vector of
/// temporaries instead of a single value.
class TempVectorStack {
  template <typename>
  friend class TempVectorHolder;

 public:
  Status Init(MemoryPool* pool, int64_t size) {
    num_vectors_ = 0;
    top_ = 0;
    buffer_size_ = size;
    ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
    // Ensure later operations don't accidentally read uninitialized memory.
    std::memset(buffer->mutable_data(), 0xFF, size);
    buffer_ = std::move(buffer);
    return Status::OK();
  }

 private:
  int64_t PaddedAllocationSize(int64_t num_bytes) {
    // Round up allocation size to multiple of 8 bytes
    // to avoid returning temp vectors with unaligned address.
    //
    // Also add padding at the end to facilitate loads and stores
    // using SIMD when number of vector elements is not divisible
    // by the number of SIMD lanes.
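    // For example, num_bytes = 5 rounds up to 8; adding kPadding (64) gives 72.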
    //
    return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding;
  }
  void alloc(uint32_t num_bytes, uint8_t** data, int* id) {
    int64_t old_top = top_;
    top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
    // Stack overflow check
    ARROW_DCHECK(top_ <= buffer_size_);
    *data = buffer_->mutable_data() + old_top + sizeof(uint64_t);
    // We set 8 bytes before the beginning of the allocated range and
    // 8 bytes after the end to check for stack overflow (which would
    // result in those known bytes being corrupted).
    reinterpret_cast<uint64_t*>(buffer_->mutable_data() + old_top)[0] = kGuard1;
    reinterpret_cast<uint64_t*>(buffer_->mutable_data() + top_)[-1] = kGuard2;
    *id = num_vectors_++;
  }
  void release(int id, uint32_t num_bytes) {
    ARROW_DCHECK(num_vectors_ == id + 1);
    int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
    ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[-1] ==
                 kGuard2);
    ARROW_DCHECK(top_ >= size);
    top_ -= size;
    ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[0] ==
                 kGuard1);
    --num_vectors_;
  }
  static constexpr uint64_t kGuard1 = 0x3141592653589793ULL;
  static constexpr uint64_t kGuard2 = 0x0577215664901532ULL;
  static constexpr int64_t kPadding = 64;
  int num_vectors_;
  int64_t top_;
  std::unique_ptr<Buffer> buffer_;
  int64_t buffer_size_;
};

template <typename T>
class TempVectorHolder {
  friend class TempVectorStack;

 public:
  ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); }
  T* mutable_data() { return reinterpret_cast<T*>(data_); }
  TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) {
    stack_ = stack;
    num_elements_ = num_elements;
    stack_->alloc(num_elements * sizeof(T), &data_, &id_);
  }

 private:
  TempVectorStack* stack_;
  uint8_t* data_;
  int id_;
  uint32_t num_elements_;
};
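
// A minimal usage sketch (illustrative only; the stack size and element type
// are arbitrary choices here). The stack is initialized once, and holders are
// then created as local variables so that their slices are released in reverse
// order of allocation, as required by TempVectorStack:
//
//   TempVectorStack stack;
//   ARROW_RETURN_NOT_OK(stack.Init(default_memory_pool(), 64 * 1024));
//   {
//     TempVectorHolder<uint16_t> ids(&stack, MiniBatch::kMiniBatchLength);
//     uint16_t* ids_buf = ids.mutable_data();
//     // ... use up to MiniBatch::kMiniBatchLength elements of ids_buf ...
//   }  // ~TempVectorHolder returns the slice to the stack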

class bit_util {
 public:
  static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
                              const int num_bits, const uint8_t* bits, int* num_indexes,
                              uint16_t* indexes, int bit_offset = 0);

  static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
                                  const int num_bits, const uint8_t* bits,
                                  const uint16_t* input_indexes, int* num_indexes,
                                  uint16_t* indexes, int bit_offset = 0);

  // Input and output indexes may be pointing to the same data (in-place filtering).
  static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
                                 const uint8_t* bits, int* num_indexes_bit0,
                                 uint16_t* indexes_bit0, uint16_t* indexes_bit1,
                                 int bit_offset = 0);

  // Bit 1 is replaced with byte 0xFF.
  static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
                            const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);

  // Return highest bit of each byte.
  static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
                            const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);

  static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
                                 uint32_t num_bytes);

 private:
  inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
                                            int* num_indexes, uint16_t* indexes);
  inline static void bits_filter_indexes_helper(uint64_t word,
                                                const uint16_t* input_indexes,
                                                int* num_indexes, uint16_t* indexes);
  template <int bit_to_search, bool filter_input_indexes>
  static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
                                       const uint8_t* bits, const uint16_t* input_indexes,
                                       int* num_indexes, uint16_t* indexes,
                                       uint16_t base_index = 0);

#if defined(ARROW_HAVE_AVX2)
  static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
                                   const uint8_t* bits, int* num_indexes,
                                   uint16_t* indexes, uint16_t base_index = 0);
  static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
                                       const uint8_t* bits, const uint16_t* input_indexes,
                                       int* num_indexes, uint16_t* indexes);
  template <int bit_to_search>
  static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
                                       int* num_indexes, uint16_t* indexes,
                                       uint16_t base_index = 0);
  template <int bit_to_search>
  static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
                                           const uint16_t* input_indexes,
                                           int* num_indexes, uint16_t* indexes);
  static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
  static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
  static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
#endif
};
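
// Usage sketch (illustrative only; validity_bitmap and num_rows are hypothetical,
// with num_rows at most MiniBatch::kMiniBatchLength). hardware_flags selects the
// SIMD path and would typically be obtained from arrow::internal::CpuInfo:
//
//   int64_t hardware_flags = ::arrow::internal::CpuInfo::GetInstance()->hardware_flags();
//   uint16_t row_ids[MiniBatch::kMiniBatchLength];
//   int num_row_ids = 0;
//   // Collect positions of set bits (e.g. valid rows) into row_ids.
//   bit_util::bits_to_indexes(/*bit_to_search=*/1, hardware_flags, num_rows,
//                             validity_bitmap, &num_row_ids, row_ids);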

}  // namespace util
namespace compute {

ARROW_EXPORT
Status ValidateExecNodeInputs(ExecPlan* plan, const std::vector<ExecNode*>& inputs,
                              int expected_num_inputs, const char* kind_name);

ARROW_EXPORT
Result<std::shared_ptr<Table>> TableFromExecBatches(
    const std::shared_ptr<Schema>& schema, const std::vector<ExecBatch>& exec_batches);

class AtomicCounter {
 public:
  AtomicCounter() = default;

  int count() const { return count_.load(); }

  util::optional<int> total() const {
    int total = total_.load();
    if (total == -1) return {};
    return total;
  }

  // return true if the counter is complete
  bool Increment() {
    DCHECK_NE(count_.load(), total_.load());
    int count = count_.fetch_add(1) + 1;
    if (count != total_.load()) return false;
    return DoneOnce();
  }

  // return true if the counter is complete
  bool SetTotal(int total) {
    total_.store(total);
    if (count_.load() != total) return false;
    return DoneOnce();
  }

  // return true if the counter has not already been completed
  bool Cancel() { return DoneOnce(); }

  // return true if the counter has finished or been cancelled
  bool Completed() { return complete_.load(); }

 private:
  // ensure there is only one true return from Increment(), SetTotal(), or Cancel()
  bool DoneOnce() {
    bool expected = false;
    return complete_.compare_exchange_strong(expected, true);
  }

  std::atomic<int> count_{0}, total_{-1};
  std::atomic<bool> complete_{false};
};
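
// Usage sketch (illustrative only; num_inputs and Finish() are hypothetical and
// stand in for whatever completion step the caller needs). Absent a Cancel(),
// exactly one of the calls below returns true once all inputs have been counted,
// so the completion step runs exactly once:
//
//   AtomicCounter input_counter;
//   // Producer side, as soon as the number of inputs is known:
//   if (input_counter.SetTotal(num_inputs)) Finish();
//   // Consumer side, called once per received input:
//   if (input_counter.Increment()) Finish();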

class ThreadIndexer {
 public:
  size_t operator()();

  static size_t Capacity();

 private:
  static size_t Check(size_t thread_index);

  util::Mutex mutex_;
  std::unordered_map<std::thread::id, size_t> id_to_index_;
};
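
// Usage sketch (illustrative only; ThreadLocalState is a hypothetical per-thread
// struct). The returned index is dense and bounded by Capacity(), which makes it
// suitable for addressing pre-sized per-thread scratch state:
//
//   ThreadIndexer thread_indexer;
//   std::vector<ThreadLocalState> states(ThreadIndexer::Capacity());
//   // On any worker thread:
//   ThreadLocalState& state = states[thread_indexer()];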

}  // namespace compute
}  // namespace arrow