// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// Private header, not to be exported

#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

#include "arrow/array/builder_binary.h"
#include "arrow/buffer_builder.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_builders.h"
#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"

#define XXH_INLINE_ALL
#include "arrow/vendored/xxhash.h"  // IWYU pragma: keep

namespace arrow {
namespace internal {

// XXX would it help to have a 32-bit hash value on large datasets?
typedef uint64_t hash_t;

// Notes about the choice of a hash function.
// - XXH3 is extremely fast on most data sizes, from small to huge;
//   faster even than HW CRC-based hashing schemes
// - our custom hash function for tiny values (< 16 bytes) is still
//   significantly faster (~30%), at least on this machine and compiler

template <uint64_t AlgNum>
inline hash_t ComputeStringHash(const void* data, int64_t length);

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelperBase {
  static bool CompareScalars(Scalar u, Scalar v) { return u == v; }

  static hash_t ComputeHash(const Scalar& value) {
    // Generic hash computation for scalars.  Simply apply the string hash
    // to the bit representation of the value.

    // XXX in the case of FP values, we'd like equal values to have the same hash,
    // even if they have different bit representations...
    return ComputeStringHash<AlgNum>(&value, sizeof(value));
  }
};

template <typename Scalar, uint64_t AlgNum = 0, typename Enable = void>
struct ScalarHelper : public ScalarHelperBase<Scalar, AlgNum> {};

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_integral<Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // ScalarHelper specialization for integers

  static hash_t ComputeHash(const Scalar& value) {
    // Faster hash computation for integers.

    // Two of xxhash's prime multipliers (which are chosen for their
    // bit dispersion properties)
    static constexpr uint64_t multipliers[] = {11400714785074694791ULL,
                                               14029467366897019727ULL};

    // Multiplying by the prime number mixes the low bits into the high bits,
    // then byte-swapping (which is a single CPU instruction) allows the
    // combined high and low bits to participate in the initial hash table index.
    auto h = static_cast<hash_t>(value);
    return bit_util::ByteSwap(multipliers[AlgNum] * h);
  }
};

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum,
                    enable_if_t<std::is_same<util::string_view, Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // ScalarHelper specialization for util::string_view

  static hash_t ComputeHash(const util::string_view& value) {
    return ComputeStringHash<AlgNum>(value.data(), static_cast<int64_t>(value.size()));
  }
};

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_floating_point<Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // ScalarHelper specialization for reals

  static bool CompareScalars(Scalar u, Scalar v) {
    if (std::isnan(u)) {
      // XXX should we do a bit-precise comparison?
      return std::isnan(v);
    }
    return u == v;
  }
};

template <uint64_t AlgNum = 0>
hash_t ComputeStringHash(const void* data, int64_t length) {
  if (ARROW_PREDICT_TRUE(length <= 16)) {
    // Specialize for small hash strings, as they are quite common as
    // hash table keys.  Even XXH3 isn't quite as fast.
    auto p = reinterpret_cast<const uint8_t*>(data);
    auto n = static_cast<uint32_t>(length);
    if (n <= 8) {
      if (n <= 3) {
        if (n == 0) {
          return 1U;
        }
        uint32_t x = (n << 24) ^ (p[0] << 16) ^ (p[n / 2] << 8) ^ p[n - 1];
        return ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
      }
      // 4 <= length <= 8
      // We can read the string as two overlapping 32-bit ints, apply
      // different hash functions to each of them in parallel, then XOR
      // the results
      uint32_t x, y;
      hash_t hx, hy;
      x = util::SafeLoadAs<uint32_t>(p + n - 4);
      y = util::SafeLoadAs<uint32_t>(p);
      hx = ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
      hy = ScalarHelper<uint32_t, AlgNum ^ 1>::ComputeHash(y);
      return n ^ hx ^ hy;
    }
    // 8 <= length <= 16
    // Apply the same principle as above
    uint64_t x, y;
    hash_t hx, hy;
    x = util::SafeLoadAs<uint64_t>(p + n - 8);
    y = util::SafeLoadAs<uint64_t>(p);
    hx = ScalarHelper<uint64_t, AlgNum>::ComputeHash(x);
    hy = ScalarHelper<uint64_t, AlgNum ^ 1>::ComputeHash(y);
    return n ^ hx ^ hy;
  }

#if XXH3_SECRET_SIZE_MIN != 136
#error XXH3_SECRET_SIZE_MIN changed, please fix kXxh3Secrets
#endif

  // XXH3_64bits_withSeed generates a secret based on the seed, which is too slow.
  // Instead, we use hard-coded random secrets.  To maximize cache efficiency,
  // they reuse the same memory area.
  static constexpr unsigned char kXxh3Secrets[XXH3_SECRET_SIZE_MIN + 1] = {
      0xe7, 0x8b, 0x13, 0xf9, 0xfc, 0xb5, 0x8e, 0xef, 0x81, 0x48, 0x2c, 0xbf, 0xf9,
      0x9f, 0xc1, 0x1e, 0x43, 0x6d, 0xbf, 0xa6, 0x6d, 0xb5, 0x72, 0xbc, 0x97, 0xd8,
      0x61, 0x24, 0x0f, 0x12, 0xe3, 0x05, 0x21, 0xf7, 0x5c, 0x66, 0x67, 0xa5, 0x65,
      0x03, 0x96, 0x26, 0x69, 0xd8, 0x29, 0x20, 0xf8, 0xc7, 0xb0, 0x3d, 0xdd, 0x7d,
      0x18, 0xa0, 0x60, 0x75, 0x92, 0xa4, 0xce, 0xba, 0xc0, 0x77, 0xf4, 0xac, 0xb7,
      0x03, 0x53, 0xf0, 0x98, 0xce, 0xe6, 0x2b, 0x20, 0xc7, 0x82, 0x91, 0xab, 0xbf,
      0x68, 0x5c, 0x62, 0x4d, 0x33, 0xa3, 0xe1, 0xb3, 0xff, 0x97, 0x54, 0x4c, 0x44,
      0x34, 0xb5, 0xb9, 0x32, 0x4c, 0x75, 0x42, 0x89, 0x53, 0x94, 0xd4, 0x9f, 0x2b,
      0x76, 0x4d, 0x4e, 0xe6, 0xfa, 0x15, 0x3e, 0xc1, 0xdb, 0x71, 0x4b, 0x2c, 0x94,
      0xf5, 0xfc, 0x8c, 0x89, 0x4b, 0xfb, 0xc1, 0x82, 0xa5, 0x6a, 0x53, 0xf9, 0x4a,
      0xba, 0xce, 0x1f, 0xc0, 0x97, 0x1a, 0x87};

  static_assert(AlgNum < 2, "AlgNum too large");
  static constexpr auto secret = kXxh3Secrets + AlgNum;
  return XXH3_64bits_withSecret(data, static_cast<size_t>(length), secret,
                                XXH3_SECRET_SIZE_MIN);
}

// XXX add a HashEq struct with both hash and compare functions?
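// A minimal usage sketch of the helpers above (illustrative only, not part of
// this header's API).  Assuming a translation unit that includes this header:
//
//   using arrow::internal::ComputeStringHash;
//   using arrow::internal::ScalarHelper;
//   using arrow::internal::hash_t;
//
//   // Tiny keys (< 16 bytes) take the specialized branch of ComputeStringHash:
//   // two overlapping word loads, two independent integer hashes, XORed together.
//   const char key[] = "hello";
//   hash_t h1 = ComputeStringHash<0>(key, /*length=*/5);
//
//   // Integer keys multiply by an xxhash prime and byte-swap the product.
//   hash_t h2 = ScalarHelper<int64_t, 0>::ComputeHash(12345);
//
//   // AlgNum selects one of two hash families, so two decorrelated hashes of
//   // the same key are available when needed.
//   hash_t h3 = ComputeStringHash<1>(key, /*length=*/5);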
// ----------------------------------------------------------------------
// An open-addressing insert-only hash table (no deletes)

template <typename Payload>
class HashTable {
 public:
  static constexpr hash_t kSentinel = 0ULL;
  static constexpr int64_t kLoadFactor = 2UL;

  struct Entry {
    hash_t h;
    Payload payload;

    // An entry is valid if the hash is different from the sentinel value
    operator bool() const { return h != kSentinel; }
  };

  HashTable(MemoryPool* pool, uint64_t capacity) : entries_builder_(pool) {
    DCHECK_NE(pool, nullptr);
    // Minimum of 32 elements
    capacity = std::max<uint64_t>(capacity, 32UL);
    capacity_ = bit_util::NextPower2(capacity);
    capacity_mask_ = capacity_ - 1;
    size_ = 0;

    DCHECK_OK(UpsizeBuffer(capacity_));
  }

  // Lookup with non-linear probing
  // cmp_func should have signature bool(const Payload*).
  // Return a (Entry*, found) pair.
  template <typename CmpFunc>
  std::pair<Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) {
    auto p = Lookup<DoCompare>(h, entries_, capacity_mask_,
                               std::forward<CmpFunc>(cmp_func));
    return {&entries_[p.first], p.second};
  }

  template <typename CmpFunc>
  std::pair<const Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) const {
    auto p = Lookup<DoCompare>(h, entries_, capacity_mask_,
                               std::forward<CmpFunc>(cmp_func));
    return {&entries_[p.first], p.second};
  }

  Status Insert(Entry* entry, hash_t h, const Payload& payload) {
    // Ensure entry is empty before inserting
    assert(!*entry);
    entry->h = FixHash(h);
    entry->payload = payload;
    ++size_;

    if (ARROW_PREDICT_FALSE(NeedUpsizing())) {
      // Resize less frequently since it is expensive
      return Upsize(capacity_ * kLoadFactor * 2);
    }
    return Status::OK();
  }

  uint64_t size() const { return size_; }

  // Visit all non-empty entries in the table
  // The visit_func should have signature void(const Entry*)
  template <typename VisitFunc>
  void VisitEntries(VisitFunc&& visit_func) const {
    for (uint64_t i = 0; i < capacity_; i++) {
      const auto& entry = entries_[i];
      if (entry) {
        visit_func(&entry);
      }
    }
  }

 protected:
  // NoCompare is for when the value is known not to exist in the table
  enum CompareKind { DoCompare, NoCompare };

  // The workhorse lookup function
  template <CompareKind CKind, typename CmpFunc>
  std::pair<uint64_t, bool> Lookup(hash_t h, const Entry* entries, uint64_t size_mask,
                                   CmpFunc&& cmp_func) const {
    static constexpr uint8_t perturb_shift = 5;

    uint64_t index, perturb;
    const Entry* entry;

    h = FixHash(h);
    index = h & size_mask;
    perturb = (h >> perturb_shift) + 1U;
    while (true) {
      entry = &entries[index];
      if (CompareEntry<CKind>(h, entry, std::forward<CmpFunc>(cmp_func))) {
        // Found
        return {index, true};
      }
      if (entry->h == kSentinel) {
        // Empty slot
        return {index, false};
      }

      // Perturbation logic inspired from CPython's set / dict object.
      // The goal is that all 64 bits of the unmasked hash value eventually
      // participate in the probing sequence, to minimize clustering.
      index = (index + perturb) & size_mask;
      perturb = (perturb >> perturb_shift) + 1U;
    }
  }

  template <CompareKind CKind, typename CmpFunc>
  bool CompareEntry(hash_t h, const Entry* entry, CmpFunc&& cmp_func) const {
    if (CKind == NoCompare) {
      return false;
    } else {
      return entry->h == h && cmp_func(&entry->payload);
    }
  }

  bool NeedUpsizing() const {
    // Keep the load factor <= 1/2
    return size_ * kLoadFactor >= capacity_;
  }

  Status UpsizeBuffer(uint64_t capacity) {
    RETURN_NOT_OK(entries_builder_.Resize(capacity));
    entries_ = entries_builder_.mutable_data();
    memset(static_cast<void*>(entries_), 0, capacity * sizeof(Entry));

    return Status::OK();
  }

  Status Upsize(uint64_t new_capacity) {
    assert(new_capacity > capacity_);
    uint64_t new_mask = new_capacity - 1;
    assert((new_capacity & new_mask) == 0);  // it's a power of two

    // Stash old entries and seal builder, effectively resetting the Buffer
    const Entry* old_entries = entries_;
    ARROW_ASSIGN_OR_RAISE(auto previous, entries_builder_.FinishWithLength(capacity_));
    // Allocate new buffer
    RETURN_NOT_OK(UpsizeBuffer(new_capacity));

    for (uint64_t i = 0; i < capacity_; i++) {
      const auto& entry = old_entries[i];
      if (entry) {
        // Dummy compare function will not be called
        auto p = Lookup<NoCompare>(entry.h, entries_, new_mask,
                                   [](const Payload*) { return false; });
        // Lookup<NoCompare> (and CompareEntry<NoCompare>) ensure that an
        // empty slot is always returned
        assert(!p.second);
        entries_[p.first] = entry;
      }
    }
    capacity_ = new_capacity;
    capacity_mask_ = new_mask;

    return Status::OK();
  }

  hash_t FixHash(hash_t h) const { return (h == kSentinel) ? 42U : h; }

  // The number of slots available in the hash table array.
  uint64_t capacity_;
  uint64_t capacity_mask_;
  // The number of used slots in the hash table array.
  uint64_t size_;

  Entry* entries_;
  TypedBufferBuilder<Entry> entries_builder_;
};

// XXX typedef memo_index_t int32_t ?

constexpr int32_t kKeyNotFound = -1;

// ----------------------------------------------------------------------
// A base class for memoization table.

class MemoTable {
 public:
  virtual ~MemoTable() = default;

  virtual int32_t size() const = 0;
};
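// A minimal sketch of the intended Lookup/Insert protocol (illustrative only;
// the payload type and values below are made up, and error handling is elided).
// Lookup() returns the slot where the key either lives or would be inserted,
// so a failed lookup can be followed directly by Insert() on the same entry.
//
//   struct IntPayload {
//     int64_t value;
//     int32_t memo_index;
//   };
//
//   HashTable<IntPayload> table(default_memory_pool(), /*capacity=*/32);
//   const int64_t key = 42;
//   hash_t h = ScalarHelper<int64_t, 0>::ComputeHash(key);
//   auto cmp = [&](const IntPayload* payload) { return payload->value == key; };
//   auto p = table.Lookup(h, cmp);
//   if (!p.second) {
//     // Not found: p.first points at the empty slot chosen by the probe sequence.
//     RETURN_NOT_OK(table.Insert(p.first, h, {key, /*memo_index=*/0}));
//   }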
// ----------------------------------------------------------------------
// A memoization table for memory-cheap scalar values.

// The memoization table remembers and allows looking up the insertion
// index for each key.

template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
class ScalarMemoTable : public MemoTable {
 public:
  explicit ScalarMemoTable(MemoryPool* pool, int64_t entries = 0)
      : hash_table_(pool, static_cast<uint64_t>(entries)) {}

  int32_t Get(const Scalar& value) const {
    auto cmp_func = [value](const Payload* payload) -> bool {
      return ScalarHelper<Scalar, 0>::CompareScalars(payload->value, value);
    };
    hash_t h = ComputeHash(value);
    auto p = hash_table_.Lookup(h, cmp_func);
    if (p.second) {
      return p.first->payload.memo_index;
    } else {
      return kKeyNotFound;
    }
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    auto cmp_func = [value](const Payload* payload) -> bool {
      return ScalarHelper<Scalar, 0>::CompareScalars(value, payload->value);
    };
    hash_t h = ComputeHash(value);
    auto p = hash_table_.Lookup(h, cmp_func);
    int32_t memo_index;
    if (p.second) {
      memo_index = p.first->payload.memo_index;
      on_found(memo_index);
    } else {
      memo_index = size();
      RETURN_NOT_OK(hash_table_.Insert(p.first, h, {value, memo_index}));
      on_not_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  Status GetOrInsert(const Scalar& value, int32_t* out_memo_index) {
    return GetOrInsert(
        value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  int32_t GetNull() const { return null_index_; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    int32_t memo_index = GetNull();
    if (memo_index != kKeyNotFound) {
      on_found(memo_index);
    } else {
      null_index_ = memo_index = size();
      on_not_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table, +1 if null was added
  // (which is also 1 + the largest memo index)
  int32_t size() const override {
    return static_cast<int32_t>(hash_table_.size()) + (GetNull() != kKeyNotFound);
  }

  // Copy values starting from index `start` into `out_data`
  void CopyValues(int32_t start, Scalar* out_data) const {
    hash_table_.VisitEntries([=](const HashTableEntry* entry) {
      int32_t index = entry->payload.memo_index - start;
      if (index >= 0) {
        out_data[index] = entry->payload.value;
      }
    });
    // Zero-initialize the null entry
    if (null_index_ != kKeyNotFound) {
      int32_t index = null_index_ - start;
      if (index >= 0) {
        out_data[index] = Scalar{};
      }
    }
  }

  void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }

 protected:
  struct Payload {
    Scalar value;
    int32_t memo_index;
  };

  using HashTableType = HashTableTemplateType<Payload>;
  using HashTableEntry = typename HashTableType::Entry;
  HashTableType hash_table_;
  int32_t null_index_ = kKeyNotFound;

  hash_t ComputeHash(const Scalar& value) const {
    return ScalarHelper<Scalar, 0>::ComputeHash(value);
  }
};
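// A minimal sketch of dictionary-encoding a column of int32 values with the
// memo table above (illustrative only; error handling elided):
//
//   ScalarMemoTable<int32_t> memo(default_memory_pool());
//   const std::vector<int32_t> values = {3, 7, 3, 3, 9};
//   std::vector<int32_t> indices;
//   for (int32_t v : values) {
//     int32_t index;
//     RETURN_NOT_OK(memo.GetOrInsert(v, &index));
//     indices.push_back(index);
//   }
//   // indices is now {0, 1, 0, 0, 2}; CopyValues() materializes the
//   // dictionary {3, 7, 9} in insertion order.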
// ----------------------------------------------------------------------
// A memoization table for small scalar values, using direct indexing

template <typename Scalar, typename Enable = void>
struct SmallScalarTraits {};

template <>
struct SmallScalarTraits<bool> {
  static constexpr int32_t cardinality = 2;

  static uint32_t AsIndex(bool value) { return value ? 1 : 0; }
};

template <typename Scalar>
struct SmallScalarTraits<Scalar, enable_if_t<std::is_integral<Scalar>::value>> {
  using Unsigned = typename std::make_unsigned<Scalar>::type;

  static constexpr int32_t cardinality = 1U + std::numeric_limits<Unsigned>::max();

  static uint32_t AsIndex(Scalar value) { return static_cast<Unsigned>(value); }
};

template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
class SmallScalarMemoTable : public MemoTable {
 public:
  explicit SmallScalarMemoTable(MemoryPool* pool, int64_t entries = 0) {
    std::fill(value_to_index_, value_to_index_ + cardinality + 1, kKeyNotFound);
    index_to_value_.reserve(cardinality);
  }

  int32_t Get(const Scalar value) const {
    auto value_index = AsIndex(value);
    return value_to_index_[value_index];
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    auto value_index = AsIndex(value);
    auto memo_index = value_to_index_[value_index];
    if (memo_index == kKeyNotFound) {
      memo_index = static_cast<int32_t>(index_to_value_.size());
      index_to_value_.push_back(value);
      value_to_index_[value_index] = memo_index;
      DCHECK_LT(memo_index, cardinality + 1);
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  Status GetOrInsert(const Scalar value, int32_t* out_memo_index) {
    return GetOrInsert(
        value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  int32_t GetNull() const { return value_to_index_[cardinality]; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    auto memo_index = GetNull();
    if (memo_index == kKeyNotFound) {
      memo_index = value_to_index_[cardinality] = size();
      index_to_value_.push_back(0);
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table
  // (which is also 1 + the largest memo index)
  int32_t size() const override { return static_cast<int32_t>(index_to_value_.size()); }

  // Copy values starting from index `start` into `out_data`
  void CopyValues(int32_t start, Scalar* out_data) const {
    DCHECK_GE(start, 0);
    DCHECK_LE(static_cast<size_t>(start), index_to_value_.size());
    memcpy(out_data, index_to_value_.data() + start, (size() - start) * sizeof(Scalar));
  }

  void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }

  const std::vector<Scalar>& values() const { return index_to_value_; }

 protected:
  static constexpr auto cardinality = SmallScalarTraits<Scalar>::cardinality;
  static_assert(cardinality <= 256, "cardinality too large for direct-addressed table");

  uint32_t AsIndex(Scalar value) const {
    return SmallScalarTraits<Scalar>::AsIndex(value);
  }

  // The last index is reserved for the null element.
  int32_t value_to_index_[cardinality + 1];
  std::vector<Scalar> index_to_value_;
};
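// For the 8-bit types this table is used with, lookups degenerate into plain
// array indexing (illustrative only; error handling elided): with
// Scalar = uint8_t, cardinality is 256, slot 256 of value_to_index_ is
// reserved for null, and GetOrInsert() is a single array read plus, on a
// miss, an append to index_to_value_.
//
//   SmallScalarMemoTable<uint8_t> memo(default_memory_pool());
//   int32_t index;
//   RETURN_NOT_OK(memo.GetOrInsert(uint8_t{7}, &index));  // index == 0
//   RETURN_NOT_OK(memo.GetOrInsert(uint8_t{7}, &index));  // index == 0 again
//   int32_t null_index = memo.GetOrInsertNull();          // null_index == 1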
// ----------------------------------------------------------------------
// A memoization table for variable-sized binary data.

template <typename BinaryBuilderT>
class BinaryMemoTable : public MemoTable {
 public:
  using builder_offset_type = typename BinaryBuilderT::offset_type;
  explicit BinaryMemoTable(MemoryPool* pool, int64_t entries = 0,
                           int64_t values_size = -1)
      : hash_table_(pool, static_cast<uint64_t>(entries)), binary_builder_(pool) {
    const int64_t data_size = (values_size < 0) ? entries * 4 : values_size;
    DCHECK_OK(binary_builder_.Resize(entries));
    DCHECK_OK(binary_builder_.ReserveData(data_size));
  }

  int32_t Get(const void* data, builder_offset_type length) const {
    hash_t h = ComputeStringHash<0>(data, length);
    auto p = Lookup(h, data, length);
    if (p.second) {
      return p.first->payload.memo_index;
    } else {
      return kKeyNotFound;
    }
  }

  int32_t Get(const util::string_view& value) const {
    return Get(value.data(), static_cast<builder_offset_type>(value.length()));
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(const void* data, builder_offset_type length, Func1&& on_found,
                     Func2&& on_not_found, int32_t* out_memo_index) {
    hash_t h = ComputeStringHash<0>(data, length);
    auto p = Lookup(h, data, length);
    int32_t memo_index;
    if (p.second) {
      memo_index = p.first->payload.memo_index;
      on_found(memo_index);
    } else {
      memo_index = size();
      // Insert string value
      RETURN_NOT_OK(binary_builder_.Append(static_cast<const uint8_t*>(data), length));
      // Insert hash entry
      RETURN_NOT_OK(
          hash_table_.Insert(const_cast<HashTableEntry*>(p.first), h, {memo_index}));

      on_not_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(const util::string_view& value, Func1&& on_found,
                     Func2&& on_not_found, int32_t* out_memo_index) {
    return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
                       std::forward<Func1>(on_found), std::forward<Func2>(on_not_found),
                       out_memo_index);
  }

  Status GetOrInsert(const void* data, builder_offset_type length,
                     int32_t* out_memo_index) {
    return GetOrInsert(
        data, length, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  Status GetOrInsert(const util::string_view& value, int32_t* out_memo_index) {
    return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
                       out_memo_index);
  }

  int32_t GetNull() const { return null_index_; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    int32_t memo_index = GetNull();
    if (memo_index == kKeyNotFound) {
      memo_index = null_index_ = size();
      DCHECK_OK(binary_builder_.AppendNull());
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table
  // (which is also 1 + the largest memo index)
  int32_t size() const override {
    return static_cast<int32_t>(hash_table_.size() + (GetNull() != kKeyNotFound));
  }

  int64_t values_size() const { return binary_builder_.value_data_length(); }
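  // A minimal sketch of memoizing string keys (illustrative only; error
  // handling elided).  Keys are passed as (pointer, length) or
  // util::string_view; the bytes are appended to the internal builder on first
  // insertion, and CopyOffsets()/CopyValues() below later rebuild a standard
  // offsets + data buffer pair from it.
  //
  //   BinaryMemoTable<BinaryBuilder> memo(default_memory_pool());
  //   int32_t index;
  //   RETURN_NOT_OK(memo.GetOrInsert(util::string_view("foo"), &index));  // 0
  //   RETURN_NOT_OK(memo.GetOrInsert(util::string_view("bar"), &index));  // 1
  //   RETURN_NOT_OK(memo.GetOrInsert(util::string_view("foo"), &index));  // 0 again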
  // Copy (n + 1) offsets starting from index `start` into `out_data`
  template <typename Offset>
  void CopyOffsets(int32_t start, Offset* out_data) const {
    DCHECK_LE(start, size());

    const builder_offset_type* offsets = binary_builder_.offsets_data();
    const builder_offset_type delta =
        start < binary_builder_.length() ? offsets[start] : 0;
    for (int32_t i = start; i < size(); ++i) {
      const builder_offset_type adjusted_offset = offsets[i] - delta;
      Offset cast_offset = static_cast<Offset>(adjusted_offset);
      assert(static_cast<builder_offset_type>(cast_offset) ==
             adjusted_offset);  // avoid truncation
      *out_data++ = cast_offset;
    }

    // Copy the last offset value since BinaryBuilder only materializes it in Finish()
    *out_data = static_cast<Offset>(binary_builder_.value_data_length() - delta);
  }

  template <typename Offset>
  void CopyOffsets(Offset* out_data) const {
    CopyOffsets(0, out_data);
  }

  // Copy values starting from index `start` into `out_data`
  void CopyValues(int32_t start, uint8_t* out_data) const {
    CopyValues(start, -1, out_data);
  }

  // Same as above, but check output size in debug mode
  void CopyValues(int32_t start, int64_t out_size, uint8_t* out_data) const {
    DCHECK_LE(start, size());

    // The absolute byte offset of `start` value in the binary buffer.
    const builder_offset_type offset = binary_builder_.offset(start);
    const auto length =
        binary_builder_.value_data_length() - static_cast<size_t>(offset);

    if (out_size != -1) {
      assert(static_cast<int64_t>(length) <= out_size);
    }

    auto view = binary_builder_.GetView(start);
    memcpy(out_data, view.data(), length);
  }

  void CopyValues(uint8_t* out_data) const { CopyValues(0, -1, out_data); }

  void CopyValues(int64_t out_size, uint8_t* out_data) const {
    CopyValues(0, out_size, out_data);
  }

  void CopyFixedWidthValues(int32_t start, int32_t width_size, int64_t out_size,
                            uint8_t* out_data) const {
    // This method exists to cope with the fact that the BinaryMemoTable does
    // not know the fixed width when inserting the null value.  The data
    // buffer holds a zero-length string for the null value (if found).
    //
    // Thus, the method will properly inject an empty value of the proper width
    // in the output buffer.
    //
    if (start >= size()) {
      return;
    }

    int32_t null_index = GetNull();
    if (null_index < start) {
      // Nothing to skip, proceed as usual.
      CopyValues(start, out_size, out_data);
      return;
    }

    builder_offset_type left_offset = binary_builder_.offset(start);

    // Ensure that the data length is exactly missing width_size bytes to fit
    // in the expected output (n_values * width_size).
#ifndef NDEBUG
    int64_t data_length = values_size() - static_cast<int64_t>(left_offset);
    assert(data_length + width_size == out_size);
    ARROW_UNUSED(data_length);
#endif

    auto in_data = binary_builder_.value_data() + left_offset;
    // The null entry has a 0-length slot in the data: slice the data in two
    // and skip width_size bytes in out_data.  [part_1][width_size][part_2]
    auto null_data_offset = binary_builder_.offset(null_index);
    auto left_size = null_data_offset - left_offset;
    if (left_size > 0) {
      memcpy(out_data, in_data, left_size);
    }
    // Zero-initialize the null entry
    memset(out_data + left_size, 0, width_size);

    auto right_size = values_size() - static_cast<int64_t>(null_data_offset);
    if (right_size > 0) {
      // skip the null fixed size value.
      auto out_offset = left_size + width_size;
      assert(out_data + out_offset + right_size == out_data + out_size);
      memcpy(out_data + out_offset, in_data + left_size, right_size);
    }
  }
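  // Illustrative sketch of the splice performed above (not part of the API):
  // assume a BinaryMemoTable<BinaryBuilder> `memo` into which three width-2
  // entries were memoized as "ab", null, "cd".  The builder's data buffer then
  // holds "abcd" and the null entry has length 0, so:
  //
  //   uint8_t out[6];
  //   memo.CopyFixedWidthValues(/*start=*/0, /*width_size=*/2, /*out_size=*/6, out);
  //   // out == { 'a', 'b', 0, 0, 'c', 'd' }
  //   // i.e. [part_1 = "ab"][width_size zero bytes][part_2 = "cd"]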
  // Visit the stored values in insertion order.
  // The visitor function should have the signature `void(util::string_view)`
  // or `void(const util::string_view&)`.
  template <typename VisitFunc>
  void VisitValues(int32_t start, VisitFunc&& visit) const {
    for (int32_t i = start; i < size(); ++i) {
      visit(binary_builder_.GetView(i));
    }
  }

 protected:
  struct Payload {
    int32_t memo_index;
  };

  using HashTableType = HashTable<Payload>;
  using HashTableEntry = typename HashTable<Payload>::Entry;
  HashTableType hash_table_;

  BinaryBuilderT binary_builder_;
  int32_t null_index_ = kKeyNotFound;

  std::pair<const HashTableEntry*, bool> Lookup(hash_t h, const void* data,
                                                builder_offset_type length) const {
    auto cmp_func = [=](const Payload* payload) {
      util::string_view lhs = binary_builder_.GetView(payload->memo_index);
      util::string_view rhs(static_cast<const char*>(data), length);
      return lhs == rhs;
    };
    return hash_table_.Lookup(h, cmp_func);
  }
};

template <typename T, typename Enable = void>
struct HashTraits {};

template <>
struct HashTraits<BooleanType> {
  using MemoTableType = SmallScalarMemoTable<bool>;
};

template <typename T>
struct HashTraits<T, enable_if_8bit_int<T>> {
  using c_type = typename T::c_type;
  using MemoTableType = SmallScalarMemoTable<typename T::c_type>;
};

template <typename T>
struct HashTraits<T, enable_if_t<has_c_type<T>::value && !is_8bit_int<T>::value>> {
  using c_type = typename T::c_type;
  using MemoTableType = ScalarMemoTable<c_type, HashTable>;
};

template <typename T>
struct HashTraits<T, enable_if_t<has_string_view<T>::value &&
                                 !std::is_base_of<LargeBinaryType, T>::value>> {
  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};

template <typename T>
struct HashTraits<T, enable_if_decimal<T>> {
  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};

template <typename T>
struct HashTraits<T, enable_if_t<std::is_base_of<LargeBinaryType, T>::value>> {
  using MemoTableType = BinaryMemoTable<LargeBinaryBuilder>;
};

template <typename MemoTableType>
static inline Status ComputeNullBitmap(MemoryPool* pool, const MemoTableType& memo_table,
                                       int64_t start_offset, int64_t* null_count,
                                       std::shared_ptr<Buffer>* null_bitmap) {
  int64_t dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
  int64_t null_index = memo_table.GetNull();

  *null_count = 0;
  *null_bitmap = nullptr;

  if (null_index != kKeyNotFound && null_index >= start_offset) {
    null_index -= start_offset;
    *null_count = 1;
    ARROW_ASSIGN_OR_RAISE(*null_bitmap,
                          internal::BitmapAllButOne(pool, dict_length, null_index));
  }

  return Status::OK();
}

struct StringViewHash {
  // std::hash compatible hasher for use with std::unordered_*
  // (the std::hash specialization provided by nonstd constructs std::string
  // temporaries then invokes std::hash<std::string> against those)
  hash_t operator()(const util::string_view& value) const {
    return ComputeStringHash<0>(value.data(), static_cast<int64_t>(value.size()));
  }
};

}  // namespace internal
}  // namespace arrow