# Copyright 2018-2022 Streamlit Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Hashing for st.memo and st.singleton."""

import collections
import functools
import hashlib
import inspect
import io
import os
import pickle
import sys
import tempfile
import threading
import unittest.mock
import weakref
from typing import Any, Pattern, Optional, Dict, List

from streamlit import type_util
from streamlit import util
from streamlit.logger import get_logger
from streamlit.uploaded_file_manager import UploadedFile

from .cache_errors import (
    CacheType,
    UnhashableTypeError,
)

_LOGGER = get_logger(__name__)


# If a dataframe has more than this many rows, we consider it large and hash a sample.
_PANDAS_ROWS_LARGE = 100000
_PANDAS_SAMPLE_SIZE = 10000


# Similar to dataframes, we also sample large numpy arrays.
_NP_SIZE_LARGE = 1000000
_NP_SAMPLE_SIZE = 100000


# Arbitrary item to denote where we found a cycle in a hashed object.
# This allows us to hash self-referencing lists, dictionaries, etc.
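# For example, hashing `l = [1, 2]; l.append(l)` emits the placeholder below for
# the inner self-reference instead of recursing forever.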
_CYCLE_PLACEHOLDER = b"streamlit-57R34ML17-hesamagicalponyflyingthroughthesky-CYCLE"


def update_hash(val: Any, hasher, cache_type: CacheType) -> None:
    """Updates a hashlib hasher with the hash of val.

    This is the main entrypoint to hashing.py.
    """
    ch = _CacheFuncHasher(cache_type)
    ch.update(hasher, val)
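
# Illustrative usage of update_hash (a sketch only; it assumes an MD5 hasher and
# that the CacheType enum from cache_errors defines a MEMO member):
#
#     hasher = hashlib.new("md5")
#     update_hash({"rows": 3, "cols": [1, 2, 3]}, hasher, CacheType.MEMO)
#     cache_key = hasher.hexdigest()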


class _HashStack:
    """Stack of what has been hashed, for debug and circular reference detection.

    This internally keeps 1 stack per thread.

    Internally, this stores the ID of pushed objects rather than the objects
    themselves because otherwise the "in" operator inside __contains__ would
    fail for objects that don't return a boolean for the "==" operator. For
    example, arr == 10 where arr is a NumPy array returns another NumPy array.
    This causes the "in" to crash since it expects a boolean.
    """

    def __init__(self):
        self._stack: collections.OrderedDict[int, List[Any]] = collections.OrderedDict()

    def __repr__(self) -> str:
        return util.repr_(self)

    def push(self, val: Any):
        self._stack[id(val)] = val

    def pop(self):
        self._stack.popitem()

    def __contains__(self, val: Any):
        return id(val) in self._stack


class _HashStacks:
    """Stacks of what has been hashed, with at most 1 stack per thread."""

    def __init__(self):
        self._stacks: weakref.WeakKeyDictionary[
            threading.Thread, _HashStack
        ] = weakref.WeakKeyDictionary()

    def __repr__(self) -> str:
        return util.repr_(self)

    @property
    def current(self) -> _HashStack:
        current_thread = threading.current_thread()

        stack = self._stacks.get(current_thread, None)

        if stack is None:
            stack = _HashStack()
            self._stacks[current_thread] = stack

        return stack


# Module-level singleton; each thread gets its own stack via `hash_stacks.current`.
hash_stacks = _HashStacks()


def _int_to_bytes(i: int) -> bytes:
    # bit_length() + 8 guarantees at least one spare bit for the sign, so the
    # signed two's-complement encoding below always fits (e.g. 255 -> 2 bytes).
    num_bytes = (i.bit_length() + 8) // 8
    return i.to_bytes(num_bytes, "little", signed=True)


def _key(obj: Optional[Any]) -> Any:
    """Return key for memoization."""

    if obj is None:
        return None

    def is_simple(obj):
        return (
            isinstance(obj, bytes)
            or isinstance(obj, bytearray)
            or isinstance(obj, str)
            or isinstance(obj, float)
            or isinstance(obj, int)
            or isinstance(obj, bool)
            or obj is None
        )

    if is_simple(obj):
        return obj

    if isinstance(obj, tuple):
        if all(map(is_simple, obj)):
            return obj

    if isinstance(obj, list):
        if all(map(is_simple, obj)):
            return ("__l", tuple(obj))

    if (
        type_util.is_type(obj, "pandas.core.frame.DataFrame")
        or type_util.is_type(obj, "numpy.ndarray")
        or inspect.isbuiltin(obj)
        or inspect.isroutine(obj)
        or inspect.iscode(obj)
    ):
        return id(obj)

    return NoResult


class _CacheFuncHasher:
    """A hasher that can hash objects with cycles."""

    def __init__(self, cache_type: CacheType):
        self._hashes: Dict[Any, bytes] = {}

        # The number of bytes hashed so far.
        self.size = 0

        self.cache_type = cache_type

    def __repr__(self) -> str:
        return util.repr_(self)

    def to_bytes(self, obj: Any) -> bytes:
        """Add memoization to _to_bytes and protect against cycles in data structures."""
        tname = type(obj).__qualname__.encode()
        key = (tname, _key(obj))

        # Memoize if possible.
        if key[1] is not NoResult:
            if key in self._hashes:
                return self._hashes[key]

        # Break recursive cycles.
        if obj in hash_stacks.current:
            return _CYCLE_PLACEHOLDER

        hash_stacks.current.push(obj)

        try:
            # Hash the input.
            b = b"%s:%s" % (tname, self._to_bytes(obj))

            # Hmmm... It's possible that the size calculation is wrong. When we
            # call to_bytes inside _to_bytes, things get double-counted.
            self.size += sys.getsizeof(b)

            if key[1] is not NoResult:
                self._hashes[key] = b

        finally:
            # In case an UnhashableTypeError (or other) error is thrown, clean up
            # the stack so we don't get false positives in future hashing calls.
            hash_stacks.current.pop()

        return b

    def update(self, hasher, obj: Any) -> None:
        """Update the provided hasher with the hash of an object."""
        b = self.to_bytes(obj)
        hasher.update(b)

    def _to_bytes(self, obj: Any) -> bytes:
        """Hash objects to bytes, including code with dependencies.

        Python's built-in `hash` does not produce consistent results across
        runs.
        """

        if isinstance(obj, unittest.mock.Mock):
            # Mock objects can appear to be infinitely
            # deep, so we don't try to hash them at all.
            return self.to_bytes(id(obj))

        elif isinstance(obj, bytes) or isinstance(obj, bytearray):
            return obj

        elif isinstance(obj, str):
            return obj.encode()

        elif isinstance(obj, float):
            return self.to_bytes(hash(obj))

        elif isinstance(obj, int):
            return _int_to_bytes(obj)

        elif isinstance(obj, (list, tuple)):
            h = hashlib.new("md5")
            for item in obj:
                self.update(h, item)
            return h.digest()

        elif isinstance(obj, dict):
            h = hashlib.new("md5")
            for item in obj.items():
                self.update(h, item)
            return h.digest()

        elif obj is None:
            return b"0"

        elif obj is True:
            return b"1"

        elif obj is False:
            return b"0"

        elif type_util.is_type(obj, "pandas.core.frame.DataFrame") or type_util.is_type(
            obj, "pandas.core.series.Series"
        ):
            import pandas as pd

            if len(obj) >= _PANDAS_ROWS_LARGE:
                obj = obj.sample(n=_PANDAS_SAMPLE_SIZE, random_state=0)
            try:
                return b"%s" % pd.util.hash_pandas_object(obj).sum()
            except TypeError:
                # Use pickle if pandas cannot hash the object, for example if
                # it contains unhashable objects.
                return b"%s" % pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)

        elif type_util.is_type(obj, "numpy.ndarray"):
            h = hashlib.new("md5")
            self.update(h, obj.shape)

            if obj.size >= _NP_SIZE_LARGE:
                import numpy as np

                state = np.random.RandomState(0)
                obj = state.choice(obj.flat, size=_NP_SAMPLE_SIZE)

            self.update(h, obj.tobytes())
            return h.digest()

        elif inspect.isbuiltin(obj):
            return bytes(obj.__name__.encode())

        elif type_util.is_type(obj, "builtins.mappingproxy") or type_util.is_type(
            obj, "builtins.dict_items"
        ):
            return self.to_bytes(dict(obj))

        elif type_util.is_type(obj, "builtins.getset_descriptor"):
            return bytes(obj.__qualname__.encode())

        elif isinstance(obj, UploadedFile):
            # UploadedFile is a BytesIO (thus IOBase) but has a name.
            # It does not have a timestamp, so this must come before
            # temporary files.
            h = hashlib.new("md5")
            self.update(h, obj.name)
            self.update(h, obj.tell())
            self.update(h, obj.getvalue())
            return h.digest()

        elif hasattr(obj, "name") and (
            isinstance(obj, io.IOBase)
            # Handle temporary files used during testing
            or isinstance(obj, tempfile._TemporaryFileWrapper)
        ):
            # Hash files as name + last modification date + offset.
            # NB: we're using hasattr("name") to differentiate between
            # on-disk and in-memory StringIO/BytesIO file representations.
            # That means that this condition must come *before* the next
            # condition, which just checks for StringIO/BytesIO.
            h = hashlib.new("md5")
            obj_name = getattr(obj, "name", "wonthappen")  # Just to appease MyPy.
            self.update(h, obj_name)
            self.update(h, os.path.getmtime(obj_name))
            self.update(h, obj.tell())
            return h.digest()

        elif isinstance(obj, Pattern):
            return self.to_bytes([obj.pattern, obj.flags])

        elif isinstance(obj, io.StringIO) or isinstance(obj, io.BytesIO):
            # Hash in-memory StringIO/BytesIO by their full contents
            # and seek position.
            h = hashlib.new("md5")
            self.update(h, obj.tell())
            self.update(h, obj.getvalue())
            return h.digest()

        elif type_util.is_type(obj, "numpy.ufunc"):
            # For numpy.remainder, this returns remainder.
            return bytes(obj.__name__.encode())

        elif inspect.ismodule(obj):
            # TODO: Figure out how to best show this kind of warning to the
            # user. In the meantime, show nothing. This scenario is too common,
            # so the current warning is quite annoying...
            # st.warning(('Streamlit does not support hashing modules. '
            #             'We did not hash `%s`.') % obj.__name__)
            # TODO: Hash more than just the name for internal modules.
            return self.to_bytes(obj.__name__)

        elif inspect.isclass(obj):
            # TODO: Figure out how to best show this kind of warning to the
            # user. In the meantime, show nothing. This scenario is too common
            # (e.g. in every "except" statement), so the current warning is
            # quite annoying...
            # st.warning(('Streamlit does not support hashing classes. '
            #             'We did not hash `%s`.') % obj.__name__)
            # TODO: Hash more than just the name of classes.
            return self.to_bytes(obj.__name__)

        elif isinstance(obj, functools.partial):
            # The return value of functools.partial is not a plain function:
            # it's a callable object that remembers the original function plus
            # the argument values bound to it. So here we need to special-case it.
            h = hashlib.new("md5")
            self.update(h, obj.args)
            self.update(h, obj.func)
            self.update(h, obj.keywords)
            return h.digest()

        else:
            # As a last resort, hash the output of the object's __reduce__ method.
            h = hashlib.new("md5")
            try:
                reduce_data = obj.__reduce__()
            except BaseException as e:
                raise UnhashableTypeError() from e

            for item in reduce_data:
                self.update(h, item)
            return h.digest()


class NoResult:
    """Placeholder class for return values when None is meaningful."""

    pass