2022-05-23 00:16:32 +04:00

442 lines
16 KiB
Python

# Copyright 2018-2022 Streamlit Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions to marshall a pandas.DataFrame into a proto.DataFrame."""
import datetime
import re
from collections import namedtuple
from typing import cast, Dict, Any, Optional
import pyarrow as pa
import tzlocal
from pandas import DataFrame
from pandas.io.formats.style import Styler
import streamlit
from streamlit import errors, type_util
from streamlit.logger import get_logger
from streamlit.proto.DataFrame_pb2 import (
DataFrame as DataFrameProto,
TableStyle as TableStyleProto,
)
# Module-level logger, named after this module.
LOGGER = get_logger(__name__)
# A single parsed CSS declaration (property name and its value) for a styled
# cell; instances are created by _get_css_styles.
CSSStyle = namedtuple("CSSStyle", ["property", "value"])
class LegacyDataFrameMixin:
    """Mixin providing the `_legacy_dataframe` and `_legacy_table` elements.

    Both elements serialize their input into a DataFrame proto and enqueue
    it on the DeltaGenerator this mixin is combined with.
    """

    def _legacy_dataframe(self, data=None, width=None, height=None):
        """Display a dataframe as an interactive table.

        Parameters
        ----------
        data : pandas.DataFrame, pandas.Styler, numpy.ndarray, Iterable, dict,
            or None
            The data to display.

            If 'data' is a pandas.Styler, it will be used to style its
            underlying DataFrame. Streamlit supports custom cell
            values and colors. (It does not support some of the more exotic
            pandas styling features, like bar charts, hovering, and captions.)
            Styler support is experimental!
        width : int or None
            Desired width of the UI element expressed in pixels. If None, a
            default width based on the page width is used.
        height : int or None
            Desired height of the UI element expressed in pixels. If None, a
            default height is used.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...    np.random.randn(50, 20),
        ...    columns=('col %d' % i for i in range(20)))
        ...
        >>> st._legacy_dataframe(df)

        .. output::
           https://static.streamlit.io/0.25.0-2JkNY/index.html?id=165mJbzWdAC8Duf8a4tjyQ
           height: 330px

        >>> st._legacy_dataframe(df, 200, 100)

        You can also pass a Pandas Styler object to change the style of
        the rendered DataFrame:

        >>> df = pd.DataFrame(
        ...    np.random.randn(10, 20),
        ...    columns=('col %d' % i for i in range(20)))
        ...
        >>> st._legacy_dataframe(df.style.highlight_max(axis=0))

        .. output::
           https://static.streamlit.io/0.29.0-dV1Y/index.html?id=Hb6UymSNuZDzojUNybzPby
           height: 285px

        """
        proto = DataFrameProto()
        marshall_data_frame(data, proto)
        return self.dg._enqueue(
            "data_frame", proto, element_width=width, element_height=height
        )

    def _legacy_table(self, data=None):
        """Display a static table.

        This differs from `st._legacy_dataframe` in that the table in this
        case is static: its entire contents are laid out directly on the page.

        Parameters
        ----------
        data : pandas.DataFrame, pandas.Styler, numpy.ndarray, Iterable, dict,
            or None
            The table data.

        Example
        -------
        >>> df = pd.DataFrame(
        ...    np.random.randn(10, 5),
        ...    columns=('col %d' % i for i in range(5)))
        ...
        >>> st._legacy_table(df)

        .. output::
           https://static.streamlit.io/0.25.0-2JkNY/index.html?id=KfZvDMprL4JFKXbpjD3fpq
           height: 480px

        """
        proto = DataFrameProto()
        marshall_data_frame(data, proto)
        return self.dg._enqueue("table", proto)

    @property
    def dg(self) -> "streamlit.delta_generator.DeltaGenerator":
        """Return this object, typed as a DeltaGenerator.

        The cast only informs type checkers; at runtime `self` is returned
        unchanged.
        """
        return cast("streamlit.delta_generator.DeltaGenerator", self)
def marshall_data_frame(data: Any, proto_df: DataFrameProto) -> None:
    """Serialize dataframe-like input into a proto.DataFrame, in place.

    Parameters
    ----------
    data : pandas.DataFrame, numpy.ndarray, Iterable, dict, DataFrame, Styler, or None
        Something that is or can be converted to a dataframe.
    proto_df : proto.DataFrame
        Output. Its `data`, `columns`, `index` and `style` fields are filled.

    Raises
    ------
    StreamlitAPIException
        If `data` is a pyarrow Table; those are rejected up front with a
        message pointing at the arrow serialization setting.
    """
    if isinstance(data, pa.Table):
        # The literal is kept flush-left on purpose so the message text is
        # not padded with source indentation.
        raise errors.StreamlitAPIException(
            """
pyarrow tables are not supported by Streamlit's legacy DataFrame serialization (i.e. with `config.dataFrameSerialization = "legacy"`).
To be able to use pyarrow tables, please enable pyarrow by changing the config setting,
`config.dataFrameSerialization = "arrow"`
"""
        )

    frame = type_util.convert_anything_to_df(data)

    # Lazily yield each column (a Series) by position rather than by label.
    column_series = (
        frame.iloc[:, position] for position in range(len(frame.columns))
    )
    _marshall_table(column_series, proto_df.data)
    _marshall_index(frame.columns, proto_df.columns)
    _marshall_index(frame.index, proto_df.index)

    # Styling information only exists when the caller handed us a Styler.
    if type_util.is_pandas_styler(data):
        styler = data
    else:
        styler = None
    _marshall_styles(proto_df.style, frame, styler)
def _marshall_styles(
    proto_table_style: TableStyleProto, df: DataFrame, styler: Optional[Styler] = None
) -> None:
    """Fill a proto.TableStyle with per-cell styling for `df`.

    One column entry is added per dataframe column and one cell-style entry
    per row within it. Without a Styler, every cell-style entry is left empty.

    Parameters
    ----------
    proto_table_style : proto.TableStyle
        Output. Receives the column and cell style entries.
    df : pandas.DataFrame
        Its shape determines how many entries are created.
    styler : pandas.Styler holding styling data for the data frame, or
        None if there's no style data to marshall
    """
    # Defaults for the "no Styler" case: no CSS and no display overrides.
    css_by_cell = {}
    text_by_cell = {}

    if styler is not None:
        # NB: this relies on protected members of Styler, which is non-ideal
        # and could break if Styler's interface changes.
        styler._compute()

        # In Pandas 1.3.0 the styler._translate() signature gained two
        # arguments, sparse_index and sparse_columns. The functionality
        # they provide is not yet supported, so both are passed as False.
        if type_util.is_pandas_version_less_than("1.3.0"):
            translated = styler._translate()
        else:
            translated = styler._translate(False, False)

        css_by_cell = _get_css_styles(translated)
        text_by_cell = _get_custom_display_values(translated)

    row_count, col_count = df.shape
    for col_pos in range(col_count):
        column_proto = proto_table_style.cols.add()
        for row_pos in range(row_count):
            cell_key = (row_pos, col_pos)
            cell_proto = column_proto.styles.add()

            for declaration in css_by_cell.get(cell_key, []):
                css_proto = cell_proto.css.add()
                css_proto.property = declaration.property
                css_proto.value = declaration.value

            text = text_by_cell.get(cell_key, None)
            if text is None:
                continue
            cell_proto.display_value = text
            cell_proto.has_display_value = True
def _get_css_styles(translated_style: Dict[Any, Any]) -> Dict[Any, Any]:
    """Build a {(row, col): [CSSStyle]} dictionary from a translated Styler.

    Raises RuntimeError when a selector does not look like "row<R>_col<C>"
    or when a props entry is not a two-item tuple/list.
    """
    # Shape of translated_style["cellstyle"] in pandas < 1.1.0:
    #   [
    #       {
    #           "props": [["color", " black"], ["background-color", "orange"], ["", ""]],
    #           "selector": "row0_col0"
    #       }
    #       ...
    #   ]
    #
    # Shape in pandas >= 1.1.0 (props are tuples, selectors is a list):
    #   [
    #       {
    #           "props": [("color", " black"), ("background-color", "orange"), ("", "")],
    #           "selectors": ["row0_col0"]
    #       }
    #       ...
    #   ]
    selector_pattern = re.compile(r"row(\d+)_col(\d+)")

    styles_by_cell = {}
    for cell_style in translated_style["cellstyle"]:
        # Normalise both pandas layouts to a list of selector strings.
        selectors = (
            [cell_style["selector"]]
            if type_util.is_pandas_version_less_than("1.1.0")
            else cell_style["selectors"]
        )

        for cell_selector in selectors:
            parsed = selector_pattern.match(cell_selector)
            if parsed is None:
                raise RuntimeError(
                    f'Failed to parse cellstyle selector "{cell_selector}"'
                )
            row_pos, col_pos = (int(group) for group in parsed.groups())

            # Each selector gets its own freshly built declaration list.
            declarations = []
            for prop in cell_style["props"]:
                if not (isinstance(prop, (tuple, list)) and len(prop) == 2):
                    raise RuntimeError(f'Unexpected cellstyle props "{prop}"')
                name, value = (str(part).strip() for part in prop)
                # Pairs with a blank name or value (e.g. ("", "")) are dropped.
                if name and value:
                    declarations.append(CSSStyle(property=name, value=value))

            styles_by_cell[(row_pos, col_pos)] = declarations

    return styles_by_cell
def _get_custom_display_values(translated_style: Dict[Any, Any]) -> Dict[Any, Any]:
    """Build a {(row, col): display_value} dictionary from a translated Styler.

    Only cells whose `display_value` is not None are included, and the value
    is stored as a string. Raises RuntimeError if a row holds more than one
    row header or a cell id cannot be parsed.
    """
    # translated_style['body'] has the shape:
    # [
    #   [ // row
    #      {  // cell or header
    #        'id': 'level0_row0' (for row header) | 'row0_col0' (for cells)
    #        'value': 1.329212
    #        'display_value': '132.92%'
    #        ...
    #      }
    #   ]
    # ]
    cell_id_pattern = re.compile(r"row(\d+)_col(\d+)")
    header_id_pattern = re.compile(r"level(\d+)_row(\d+)")

    custom_values = {}
    for body_row in translated_style["body"]:
        # body_row is a List[Dict] with format data for every cell in the
        # row, preceded by one extra entry for the row header.
        header_seen = False
        for cell in body_row:
            cell_id = cell["id"]  # a string in the form 'row0_col0'

            if header_id_pattern.match(cell_id):
                # Row headers carry nothing we need; as a sanity check we
                # only insist that each row has at most one of them.
                if header_seen:
                    raise RuntimeError('Found unexpected row header "%s"' % cell)
                header_seen = True
                continue

            parsed = cell_id_pattern.match(cell_id)
            if parsed is None:
                raise RuntimeError('Failed to parse cell selector "%s"' % cell_id)

            # Ideally only user-formatted cells would be sent to the
            # frontend, to save bandwidth. Styler's internals are private,
            # though, and give no consistent way to tell. Before Pandas 1.4
            # one could compare `display_value` with `value`; from 1.4 on an
            # unmodified Styler holds `display_value` strings for all cells.
            # So the only test is "not None", which in Pandas 1.4 appears to
            # always hold and is purely a defense against future changes.
            shown = cell.get("display_value")
            if shown is not None:
                row_pos = int(parsed.group(1))
                col_pos = int(parsed.group(2))
                custom_values[(row_pos, col_pos)] = str(shown)

    return custom_values
def _marshall_index(pandas_index, proto_index):
    """Convert a pandas.Index into a proto.Index.

    Parameters
    ----------
    pandas_index : pandas.Index or related
        Input. Dispatch is on the exact type of this object.
    proto_index : proto.Index
        Output. The field matching the index type is filled in place.

    Raises
    ------
    NotImplementedError
        If the index type is not one of the handled types.
    """
    import numpy as np
    import pandas as pd

    # Exact type identity is deliberate: the specialised index classes all
    # derive from pd.Index, so an isinstance() test would route every index
    # into the first branch.
    index_type = type(pandas_index)

    if index_type is pd.Index:
        _marshall_any_array(np.array(pandas_index), proto_index.plain_index.data)
    elif index_type is pd.RangeIndex:
        lowest = pandas_index.min()
        highest = pandas_index.max()
        if pd.isna(lowest) or pd.isna(highest):
            # min()/max() of an empty range are NaN: emit an empty range.
            proto_index.range_index.start = 0
            proto_index.range_index.stop = 0
        else:
            # Only the bounds are written; the range's step is not recorded.
            proto_index.range_index.start = lowest
            proto_index.range_index.stop = highest + 1
    elif index_type is pd.MultiIndex:
        for level in pandas_index.levels:
            _marshall_index(level, proto_index.multi_index.levels.add())
        if hasattr(pandas_index, "codes"):
            index_codes = pandas_index.codes
        else:
            # Deprecated in Pandas 0.24, so don't bother covering.
            index_codes = pandas_index.labels  # pragma: no cover
        for label in index_codes:
            proto_index.multi_index.labels.add().data.extend(label)
    elif index_type is pd.DatetimeIndex:
        if pandas_index.tz is None:
            # Naive timestamps are interpreted in the machine's local zone.
            current_zone = tzlocal.get_localzone()
            pandas_index = pandas_index.tz_localize(current_zone)
        proto_index.datetime_index.data.data.extend(
            pandas_index.map(datetime.datetime.isoformat)
        )
    elif index_type is pd.TimedeltaIndex:
        proto_index.timedelta_index.data.data.extend(pandas_index.astype(np.int64))
    # Int64Index and Float64Index were removed in pandas 2.0. Looking them up
    # with a default keeps unsupported index types falling through to the
    # NotImplementedError below instead of failing with AttributeError.
    elif index_type is getattr(pd, "Int64Index", None):
        proto_index.int_64_index.data.data.extend(pandas_index)
    elif index_type is getattr(pd, "Float64Index", None):
        proto_index.float_64_index.data.data.extend(pandas_index)
    else:
        raise NotImplementedError("Can't handle %s yet." % type(pandas_index))
def _marshall_table(pandas_table, proto_table):
    """Convert a sequence of 1D arrays into proto.Table.

    pandas_table - Sequence of 1D arrays which are AnyArray compatible (input).
    proto_table  - proto.Table (output)

    Zero-length arrays are skipped: no column is added to the proto for them.
    """
    for column in pandas_table:
        # len() is compared explicitly rather than relying on truthiness.
        if len(column) > 0:
            _marshall_any_array(column, proto_table.cols.add())
def _marshall_any_array(pandas_array, proto_array):
    """Convert a 1D array into a proto.AnyArray.

    Parameters
    ----------
    pandas_array : 1D array-like
        Input. Anything without a `dtype` attribute is first converted with
        `numpy.array`.
    proto_array : proto.AnyArray
        Output. Exactly one of its typed fields is extended, chosen by dtype.

    Raises
    ------
    ValueError
        If the array is not one-dimensional.
    NotImplementedError
        If the dtype is not one of the handled kinds.
    """
    import numpy as np

    # Convert to np.array as necessary.
    if not hasattr(pandas_array, "dtype"):
        pandas_array = np.array(pandas_array)

    # Only works on 1D arrays.
    if len(pandas_array.shape) != 1:
        raise ValueError("Array must be 1D.")

    dtype = pandas_array.dtype

    # Perform type-conversion based on the array dtype.
    if issubclass(dtype.type, np.floating):
        proto_array.doubles.data.extend(pandas_array)
    # timedelta64 must be tested before the generic integer case because
    # numpy's timedelta64 scalar type is itself a subclass of np.integer.
    elif issubclass(dtype.type, np.timedelta64):
        proto_array.timedeltas.data.extend(pandas_array.astype(np.int64))
    elif issubclass(dtype.type, np.integer):
        proto_array.int64s.data.extend(pandas_array)
    elif dtype == np.bool_:
        # Booleans share the int64s field with integers.
        proto_array.int64s.data.extend(pandas_array)
    elif dtype == np.object_:
        proto_array.strings.data.extend(map(str, pandas_array))
    # dtype='string', <class 'pandas.core.arrays.string_.StringDtype'>
    # NOTE: StringDtype is considered experimental.
    # The implementation and parts of the API may change without warning.
    elif dtype.name == "string":
        proto_array.strings.data.extend(map(str, pandas_array))
    # Setting a timezone changes (dtype, dtype.type) from
    #   'datetime64[ns]', <class 'numpy.datetime64'>
    # to
    #   datetime64[ns, UTC], <class 'pandas._libs.tslibs.timestamps.Timestamp'>
    elif dtype.name.startswith("datetime64"):
        # Plain numpy arrays have no .map(); wrap them in a Series so that
        # datetime64 ndarrays (and sequences converted above) are handled
        # the same way as pandas objects instead of raising AttributeError.
        if not hasattr(pandas_array, "map"):
            import pandas as pd

            pandas_array = pd.Series(pandas_array)
        # Just convert straight to ISO 8601, preserving timezone
        # awareness/unawareness. The frontend will render it correctly.
        proto_array.datetimes.data.extend(pandas_array.map(datetime.datetime.isoformat))
    else:
        raise NotImplementedError("Dtype %s not understood." % pandas_array.dtype)