# Copyright 2018-2022 Streamlit Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helper functions to marshall a pandas.DataFrame into a proto.DataFrame."""

import datetime
import re
from collections import namedtuple
from typing import cast, Dict, Any, Optional

import pyarrow as pa
import tzlocal
from pandas import DataFrame
from pandas.io.formats.style import Styler

import streamlit
from streamlit import errors, type_util
from streamlit.logger import get_logger
from streamlit.proto.DataFrame_pb2 import (
    DataFrame as DataFrameProto,
    TableStyle as TableStyleProto,
)

LOGGER = get_logger(__name__)

CSSStyle = namedtuple("CSSStyle", ["property", "value"])


class LegacyDataFrameMixin:
    def _legacy_dataframe(self, data=None, width=None, height=None):
        """Display a dataframe as an interactive table.

        Parameters
        ----------
        data : pandas.DataFrame, pandas.Styler, numpy.ndarray, Iterable, dict,
            or None
            The data to display.

            If 'data' is a pandas.Styler, it will be used to style its
            underyling DataFrame. Streamlit supports custom cell
            values and colors. (It does not support some of the more exotic
            pandas styling features, like bar charts, hovering, and captions.)
            Styler support is experimental!
        width : int or None
            Desired width of the UI element expressed in pixels. If None, a
            default width based on the page width is used.
        height : int or None
            Desired height of the UI element expressed in pixels. If None, a
            default height is used.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...    np.random.randn(50, 20),
        ...    columns=('col %d' % i for i in range(20)))
        ...
        >>> st._legacy_dataframe(df)

        .. output::
           https://static.streamlit.io/0.25.0-2JkNY/index.html?id=165mJbzWdAC8Duf8a4tjyQ
           height: 330px

        >>> st._legacy_dataframe(df, 200, 100)

        You can also pass a Pandas Styler object to change the style of
        the rendered DataFrame:

        >>> df = pd.DataFrame(
        ...    np.random.randn(10, 20),
        ...    columns=('col %d' % i for i in range(20)))
        ...
        >>> st._legacy_dataframe(df.style.highlight_max(axis=0))

        .. output::
           https://static.streamlit.io/0.29.0-dV1Y/index.html?id=Hb6UymSNuZDzojUNybzPby
           height: 285px

        """
        data_frame_proto = DataFrameProto()
        marshall_data_frame(data, data_frame_proto)

        return self.dg._enqueue(
            "data_frame",
            data_frame_proto,
            element_width=width,
            element_height=height,
        )

    def _legacy_table(self, data=None):
        """Display a static table.

        This differs from `st._legacy_dataframe` in that the table in this case is
        static: its entire contents are laid out directly on the page.

        Parameters
        ----------
        data : pandas.DataFrame, pandas.Styler, numpy.ndarray, Iterable, dict,
            or None
            The table data.

        Example
        -------
        >>> df = pd.DataFrame(
        ...    np.random.randn(10, 5),
        ...    columns=('col %d' % i for i in range(5)))
        ...
        >>> st._legacy_table(df)

        .. output::
           https://static.streamlit.io/0.25.0-2JkNY/index.html?id=KfZvDMprL4JFKXbpjD3fpq
           height: 480px

        """
        table_proto = DataFrameProto()
        marshall_data_frame(data, table_proto)
        return self.dg._enqueue("table", table_proto)

    @property
    def dg(self) -> "streamlit.delta_generator.DeltaGenerator":
        """Get our DeltaGenerator."""
        return cast("streamlit.delta_generator.DeltaGenerator", self)


def marshall_data_frame(data: Any, proto_df: DataFrameProto) -> None:
    """Convert a pandas.DataFrame into a proto.DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame, numpy.ndarray, Iterable, dict, DataFrame, Styler, or None
        Something that is or can be converted to a dataframe.

    proto_df : proto.DataFrame
        Output. The protobuf for a Streamlit DataFrame proto.
    """
    if isinstance(data, pa.Table):
        raise errors.StreamlitAPIException(
            """
pyarrow tables are not supported  by Streamlit's legacy DataFrame serialization (i.e. with `config.dataFrameSerialization = "legacy"`).

To be able to use pyarrow tables, please enable pyarrow by changing the config setting,
`config.dataFrameSerialization = "arrow"`
"""
        )
    df = type_util.convert_anything_to_df(data)

    # Convert df into an iterable of columns (each of type Series).
    df_data = (df.iloc[:, col] for col in range(len(df.columns)))

    _marshall_table(df_data, proto_df.data)
    _marshall_index(df.columns, proto_df.columns)
    _marshall_index(df.index, proto_df.index)

    styler = data if type_util.is_pandas_styler(data) else None
    _marshall_styles(proto_df.style, df, styler)


def _marshall_styles(
    proto_table_style: TableStyleProto, df: DataFrame, styler: Optional[Styler] = None
) -> None:
    """Adds pandas.Styler styling data to a proto.DataFrame

    Parameters
    ----------
    proto_table_style : proto.TableStyle
    df : pandas.DataFrame
    styler : pandas.Styler holding styling data for the data frame, or
        None if there's no style data to marshall
    """

    # NB: we're using protected members of Styler to get this data,
    # which is non-ideal and could break if Styler's interface changes.

    if styler is not None:
        styler._compute()

        # In Pandas 1.3.0, styler._translate() signature was changed.
        # 2 arguments were added: sparse_index and sparse_columns.
        # The functionality that they provide is not yet supported.
        if type_util.is_pandas_version_less_than("1.3.0"):
            translated_style = styler._translate()
        else:
            translated_style = styler._translate(False, False)

        css_styles = _get_css_styles(translated_style)
        display_values = _get_custom_display_values(translated_style)
    else:
        # If we have no Styler, we just make an empty CellStyle for each cell
        css_styles = {}
        display_values = {}

    nrows, ncols = df.shape
    for col in range(ncols):
        proto_col = proto_table_style.cols.add()
        for row in range(nrows):
            proto_cell_style = proto_col.styles.add()

            for css in css_styles.get((row, col), []):
                proto_css = proto_cell_style.css.add()
                proto_css.property = css.property
                proto_css.value = css.value

            display_value = display_values.get((row, col), None)
            if display_value is not None:
                proto_cell_style.display_value = display_value
                proto_cell_style.has_display_value = True


def _get_css_styles(translated_style: Dict[Any, Any]) -> Dict[Any, Any]:
    """Parses pandas.Styler style dictionary into a
    {(row, col): [CSSStyle]} dictionary
    """
    # In pandas < 1.1.0
    # translated_style["cellstyle"] has the following shape:
    # [
    #   {
    #       "props": [["color", " black"], ["background-color", "orange"], ["", ""]],
    #       "selector": "row0_col0"
    #   }
    #   ...
    # ]
    #
    # In pandas >= 1.1.0
    # translated_style["cellstyle"] has the following shape:
    # [
    #   {
    #       "props": [("color", " black"), ("background-color", "orange"), ("", "")],
    #       "selectors": ["row0_col0"]
    #   }
    #   ...
    # ]

    cell_selector_regex = re.compile(r"row(\d+)_col(\d+)")

    css_styles = {}
    for cell_style in translated_style["cellstyle"]:
        if type_util.is_pandas_version_less_than("1.1.0"):
            cell_selectors = [cell_style["selector"]]
        else:
            cell_selectors = cell_style["selectors"]

        for cell_selector in cell_selectors:
            match = cell_selector_regex.match(cell_selector)
            if not match:
                raise RuntimeError(
                    f'Failed to parse cellstyle selector "{cell_selector}"'
                )
            row = int(match.group(1))
            col = int(match.group(2))
            css_declarations = []
            props = cell_style["props"]
            for prop in props:
                if not isinstance(prop, (tuple, list)) or len(prop) != 2:
                    raise RuntimeError(f'Unexpected cellstyle props "{prop}"')
                name = str(prop[0]).strip()
                value = str(prop[1]).strip()
                if name and value:
                    css_declarations.append(CSSStyle(property=name, value=value))
            css_styles[(row, col)] = css_declarations

    return css_styles


def _get_custom_display_values(translated_style: Dict[Any, Any]) -> Dict[Any, Any]:
    """Parses pandas.Styler style dictionary into a
    {(row, col): display_value} dictionary for cells whose display format
    has been customized.
    """
    # Create {(row, col): display_value} from translated_style['body']
    # translated_style['body'] has the shape:
    # [
    #   [ // row
    #     {  // cell or header
    #       'id': 'level0_row0' (for row header) | 'row0_col0' (for cells)
    #       'value': 1.329212
    #       'display_value': '132.92%'
    #       ...
    #     }
    #   ]
    # ]

    def has_custom_display_value(cell: Dict[Any, Any]) -> bool:
        # We'd prefer to only pass `display_value` data to the frontend
        # when a DataFrame cell has been custom-formatted by the user, to
        # save on bandwidth. However:
        #
        # Panda's Styler's internals are private, and it doesn't give us a
        # consistent way of testing whether a cell has a custom display_value
        # or not. Prior to Pandas 1.4, we could test whether a cell's
        # `display_value` differed from its `value`, and only stick the
        # `display_value` in the protobuf when that was the case. In 1.4, an
        # unmodified Styler will contain `display_value` strings for all
        # cells, regardless of whether any formatting has been applied to
        # that cell, so we no longer have this ability.
        #
        # So we're only testing that a cell's `display_value` is not None.
        # In Pandas 1.4, it seems that `display_value` is never None, so this
        # is purely a defense against future Styler changes.
        return cell.get("display_value") is not None

    cell_selector_regex = re.compile(r"row(\d+)_col(\d+)")
    header_selector_regex = re.compile(r"level(\d+)_row(\d+)")

    display_values = {}
    for row in translated_style["body"]:
        # row is a List[Dict], containing format data for each cell in the row,
        # plus an extra first entry for the row header, which we skip
        found_row_header = False
        for cell in row:
            cell_id = cell["id"]  # a string in the form 'row0_col0'
            if header_selector_regex.match(cell_id):
                if not found_row_header:
                    # We don't care about processing row headers, but as
                    # a sanity check, ensure we only see one per row
                    found_row_header = True
                    continue
                else:
                    raise RuntimeError('Found unexpected row header "%s"' % cell)
            match = cell_selector_regex.match(cell_id)
            if not match:
                raise RuntimeError('Failed to parse cell selector "%s"' % cell_id)

            if has_custom_display_value(cell):
                row = int(match.group(1))
                col = int(match.group(2))
                display_values[(row, col)] = str(cell["display_value"])

    return display_values


def _marshall_index(pandas_index, proto_index):
    """Convert an pandas.Index into a proto.Index.

    pandas_index - Panda.Index or related (input)
    proto_index  - proto.Index (output)
    """
    import pandas as pd
    import numpy as np

    if type(pandas_index) == pd.Index:
        _marshall_any_array(np.array(pandas_index), proto_index.plain_index.data)
    elif type(pandas_index) == pd.RangeIndex:
        min = pandas_index.min()
        max = pandas_index.max()
        if pd.isna(min) or pd.isna(max):
            proto_index.range_index.start = 0
            proto_index.range_index.stop = 0
        else:
            proto_index.range_index.start = min
            proto_index.range_index.stop = max + 1
    elif type(pandas_index) == pd.MultiIndex:
        for level in pandas_index.levels:
            _marshall_index(level, proto_index.multi_index.levels.add())
        if hasattr(pandas_index, "codes"):
            index_codes = pandas_index.codes
        else:
            # Deprecated in Pandas 0.24, do don't bother covering.
            index_codes = pandas_index.labels  # pragma: no cover
        for label in index_codes:
            proto_index.multi_index.labels.add().data.extend(label)
    elif type(pandas_index) == pd.DatetimeIndex:
        if pandas_index.tz is None:
            current_zone = tzlocal.get_localzone()
            pandas_index = pandas_index.tz_localize(current_zone)
        proto_index.datetime_index.data.data.extend(
            pandas_index.map(datetime.datetime.isoformat)
        )
    elif type(pandas_index) == pd.TimedeltaIndex:
        proto_index.timedelta_index.data.data.extend(pandas_index.astype(np.int64))
    elif type(pandas_index) == pd.Int64Index:
        proto_index.int_64_index.data.data.extend(pandas_index)
    elif type(pandas_index) == pd.Float64Index:
        proto_index.float_64_index.data.data.extend(pandas_index)
    else:
        raise NotImplementedError("Can't handle %s yet." % type(pandas_index))


def _marshall_table(pandas_table, proto_table):
    """Convert a sequence of 1D arrays into proto.Table.

    pandas_table - Sequence of 1D arrays which are AnyArray compatible (input).
    proto_table  - proto.Table (output)
    """
    for pandas_array in pandas_table:
        if len(pandas_array) == 0:
            continue
        _marshall_any_array(pandas_array, proto_table.cols.add())


def _marshall_any_array(pandas_array, proto_array):
    """Convert a 1D numpy.Array into a proto.AnyArray.

    pandas_array - 1D arrays which is AnyArray compatible (input).
    proto_array  - proto.AnyArray (output)
    """
    import numpy as np

    # Convert to np.array as necessary.
    if not hasattr(pandas_array, "dtype"):
        pandas_array = np.array(pandas_array)

    # Only works on 1D arrays.
    if len(pandas_array.shape) != 1:
        raise ValueError("Array must be 1D.")

    # Perform type-conversion based on the array dtype.
    if issubclass(pandas_array.dtype.type, np.floating):
        proto_array.doubles.data.extend(pandas_array)
    elif issubclass(pandas_array.dtype.type, np.timedelta64):
        proto_array.timedeltas.data.extend(pandas_array.astype(np.int64))
    elif issubclass(pandas_array.dtype.type, np.integer):
        proto_array.int64s.data.extend(pandas_array)
    elif pandas_array.dtype == np.bool_:
        proto_array.int64s.data.extend(pandas_array)
    elif pandas_array.dtype == np.object_:
        proto_array.strings.data.extend(map(str, pandas_array))
    # dtype='string', <class 'pandas.core.arrays.string_.StringDtype'>
    # NOTE: StringDtype is considered experimental.
    # The implementation and parts of the API may change without warning.
    elif pandas_array.dtype.name == "string":
        proto_array.strings.data.extend(map(str, pandas_array))
    # Setting a timezone changes (dtype, dtype.type) from
    #   'datetime64[ns]', <class 'numpy.datetime64'>
    # to
    #   datetime64[ns, UTC], <class 'pandas._libs.tslibs.timestamps.Timestamp'>
    elif pandas_array.dtype.name.startswith("datetime64"):
        # Just convert straight to ISO 8601, preserving timezone
        # awareness/unawareness. The frontend will render it correctly.
        proto_array.datetimes.data.extend(pandas_array.map(datetime.datetime.isoformat))
    else:
        raise NotImplementedError("Dtype %s not understood." % pandas_array.dtype)