# Copyright 2018-2022 Streamlit Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A bunch of useful utilities for the watcher.

These are functions that only make sense within the watcher. In particular,
functions that use streamlit.config can go here to avoid a dependency cycle.
"""

import hashlib
import os
import time
from pathlib import Path
from typing import Optional

# How many times to try to grab the MD5 hash.
_MAX_RETRIES = 5

# How long to wait between retries.
_RETRY_WAIT_SECS = 0.1


def calc_md5_with_blocking_retries(
    path: str,
    *,  # keyword-only arguments:
    glob_pattern: Optional[str] = None,
    allow_nonexistent: bool = False,
) -> str:
    """Calculate the MD5 checksum of a given path.

    For a file, this means calculating the md5 of the file's contents. For a
    directory, we concatenate the directory's path with the names of all the
    files in it and calculate the md5 of that.

    IMPORTANT: This method calls time.sleep(), which blocks execution. So you
    should only use this outside the main thread.
    """
    if allow_nonexistent and not os.path.exists(path):
        content = path.encode("UTF-8")
    elif os.path.isdir(path):
        glob_pattern = glob_pattern or "*"
        content = _stable_dir_identifier(path, glob_pattern).encode("UTF-8")
    else:
        content = _get_file_content_with_blocking_retries(path)

    md5 = hashlib.md5()
    md5.update(content)

    # Use hexdigest() instead of digest(), so it's easier to debug.
    return md5.hexdigest()
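
# Example usage (a minimal sketch; "app.py" and "pages" are hypothetical
# paths):
#
#     file_hash = calc_md5_with_blocking_retries("app.py")
#     dir_hash = calc_md5_with_blocking_retries(
#         "pages", glob_pattern="*.py", allow_nonexistent=True
#     )
#
# A watcher can compare successive return values to detect changes.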


def path_modification_time(path: str, allow_nonexistent: bool = False) -> float:
    """Return the modification time of a path (file or directory).

    If allow_nonexistent is True and the path does not exist, we return 0.0 to
    guarantee that any file/dir later created at the path has a later
    modification time than the last time returned by this function for that
    path.

    If allow_nonexistent is False and no file/dir exists at the path, a
    FileNotFoundError is raised (by os.stat).

    For any path that does correspond to an existing file/dir, we return its
    modification time.
    """
    if allow_nonexistent and not os.path.exists(path):
        return 0.0

    return os.stat(path).st_mtime
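
# The 0.0 sentinel lets a polling watcher treat "created later" and "modified
# later" uniformly (a minimal sketch; last_mtime is a hypothetical variable):
#
#     if path_modification_time(path, allow_nonexistent=True) > last_mtime:
#         ...  # the path was created or modified since the last poll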


def _get_file_content_with_blocking_retries(file_path: str) -> bytes:
    content = b""
    # There's a race condition where sometimes file_path no longer exists when
    # we try to read it (since the file is in the process of being written).
    # So here we retry a few times using this loop. See issue #186.
    for i in range(_MAX_RETRIES):
        try:
            with open(file_path, "rb") as f:
                content = f.read()
                break
        except FileNotFoundError as e:
            if i >= _MAX_RETRIES - 1:
                raise e
            time.sleep(_RETRY_WAIT_SECS)
    return content


def _dirfiles(dir_path: str, glob_pattern: str) -> str:
    """Return a stable string describing a directory's (non-hidden) contents."""
    p = Path(dir_path)
    filenames = sorted(
        [f.name for f in p.glob(glob_pattern) if not f.name.startswith(".")]
    )
    return "+".join(filenames)


def _stable_dir_identifier(dir_path: str, glob_pattern: str) -> str:
    """Wait for the files in a directory to look stable-ish before returning an id.

    We do this to deal with problems that would otherwise arise from many tools
    (e.g. git) and editors (e.g. vim) "editing" files (from the user's
    perspective) by doing some combination of deleting, creating, and moving
    various files under the hood.

    Because of this, we're unable to rely on FileSystemEvents that we receive
    from watchdog to determine when a file has been added to or removed from a
    directory.

    This is a bit of an unfortunate situation, but the approach we take here is
    most likely fine as:
      * The worst thing that can happen taking this approach is a false
        positive page added/removed notification, which isn't too disastrous
        and can just be ignored.
      * It is impossible (that is, I'm fairly certain that the problem is
        undecidable) to know whether a file created/deleted/moved event
        corresponds to a legitimate file creation/deletion/move or is part of
        some sequence of events that results in what the user sees as a file
        "edit".
    """
    dirfiles = _dirfiles(dir_path, glob_pattern)

    for _ in range(_MAX_RETRIES):
        time.sleep(_RETRY_WAIT_SECS)

        new_dirfiles = _dirfiles(dir_path, glob_pattern)
        if dirfiles == new_dirfiles:
            break

        dirfiles = new_dirfiles

    return f"{dir_path}+{dirfiles}"