first commit

Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions


@@ -0,0 +1,421 @@
# flake8: noqa
__docformat__ = "restructuredtext"
# Let users know if they're missing any of our hard dependencies
hard_dependencies = ("numpy", "pytz", "dateutil")
missing_dependencies = []
for dependency in hard_dependencies:
try:
__import__(dependency)
except ImportError as e:
missing_dependencies.append(f"{dependency}: {e}")
if missing_dependencies:
raise ImportError(
"Unable to import required dependencies:\n" + "\n".join(missing_dependencies)
)
del hard_dependencies, dependency, missing_dependencies
# numpy compat
from pandas.compat import is_numpy_dev as _is_numpy_dev
try:
from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib
except ImportError as err: # pragma: no cover
module = err.name
raise ImportError(
f"C extension: {module} not built. If you want to import "
"pandas from the source directory, you may need to run "
"'python setup.py build_ext --force' to build the C extensions first."
) from err
else:
del _tslib, _lib, _hashtable
from pandas._config import (
get_option,
set_option,
reset_option,
describe_option,
option_context,
options,
)
# let init-time option registration happen
import pandas.core.config_init
from pandas.core.api import (
# dtype
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
Float32Dtype,
Float64Dtype,
CategoricalDtype,
PeriodDtype,
IntervalDtype,
DatetimeTZDtype,
StringDtype,
BooleanDtype,
# missing
NA,
isna,
isnull,
notna,
notnull,
# indexes
Index,
CategoricalIndex,
RangeIndex,
MultiIndex,
IntervalIndex,
TimedeltaIndex,
DatetimeIndex,
PeriodIndex,
IndexSlice,
# tseries
NaT,
Period,
period_range,
Timedelta,
timedelta_range,
Timestamp,
date_range,
bdate_range,
Interval,
interval_range,
DateOffset,
# conversion
to_numeric,
to_datetime,
to_timedelta,
# misc
Flags,
Grouper,
factorize,
unique,
value_counts,
NamedAgg,
array,
Categorical,
set_eng_float_format,
Series,
DataFrame,
)
from pandas.core.arrays.sparse import SparseDtype
from pandas.tseries.api import infer_freq
from pandas.tseries import offsets
from pandas.core.computation.api import eval
from pandas.core.reshape.api import (
concat,
lreshape,
melt,
wide_to_long,
merge,
merge_asof,
merge_ordered,
crosstab,
pivot,
pivot_table,
get_dummies,
cut,
qcut,
)
from pandas import api, arrays, errors, io, plotting, testing, tseries
from pandas.util._print_versions import show_versions
from pandas.io.api import (
# excel
ExcelFile,
ExcelWriter,
read_excel,
# parsers
read_csv,
read_fwf,
read_table,
# pickle
read_pickle,
to_pickle,
# pytables
HDFStore,
read_hdf,
# sql
read_sql,
read_sql_query,
read_sql_table,
# misc
read_clipboard,
read_parquet,
read_orc,
read_feather,
read_gbq,
read_html,
read_xml,
read_json,
read_stata,
read_sas,
read_spss,
)
from pandas.io.json import _json_normalize as json_normalize
from pandas.util._tester import test
# use the closest tagged version if possible
from pandas._version import get_versions
v = get_versions()
__version__ = v.get("closest-tag", v["version"])
__git_version__ = v.get("full-revisionid")
del get_versions, v
# GH 27101
__deprecated_num_index_names = ["Float64Index", "Int64Index", "UInt64Index"]
def __dir__():
# GH43028
# Int64Index etc. are deprecated, but we still want them to be available in the dir.
# Remove in Pandas 2.0, when we remove Int64Index etc. from the code base.
return list(globals().keys()) + __deprecated_num_index_names
def __getattr__(name):
import warnings
if name in __deprecated_num_index_names:
warnings.warn(
f"pandas.{name} is deprecated "
"and will be removed from pandas in a future version. "
"Use pandas.Index with the appropriate dtype instead.",
FutureWarning,
stacklevel=2,
)
from pandas.core.api import Float64Index, Int64Index, UInt64Index
return {
"Float64Index": Float64Index,
"Int64Index": Int64Index,
"UInt64Index": UInt64Index,
}[name]
elif name == "datetime":
warnings.warn(
"The pandas.datetime class is deprecated "
"and will be removed from pandas in a future version. "
"Import from datetime module instead.",
FutureWarning,
stacklevel=2,
)
from datetime import datetime as dt
return dt
elif name == "np":
warnings.warn(
"The pandas.np module is deprecated "
"and will be removed from pandas in a future version. "
"Import numpy directly instead.",
FutureWarning,
stacklevel=2,
)
import numpy as np
return np
elif name in {"SparseSeries", "SparseDataFrame"}:
warnings.warn(
f"The {name} class is removed from pandas. Accessing it from "
"the top-level namespace will also be removed in the next version.",
FutureWarning,
stacklevel=2,
)
return type(name, (), {})
elif name == "SparseArray":
warnings.warn(
"The pandas.SparseArray class is deprecated "
"and will be removed from pandas in a future version. "
"Use pandas.arrays.SparseArray instead.",
FutureWarning,
stacklevel=2,
)
from pandas.core.arrays.sparse import SparseArray as _SparseArray
return _SparseArray
raise AttributeError(f"module 'pandas' has no attribute '{name}'")
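# A minimal usage sketch of the lazy deprecation shim implemented by
# __getattr__ above (outputs omitted; the warnings come from the code above):
#
#   >>> import pandas as pd
#   >>> pd.Int64Index([1, 2, 3])   # emits FutureWarning, still returns the deprecated class
#   >>> pd.np.arange(3)            # emits FutureWarning, then dispatches to numpy
#   >>> pd.datetime(2022, 5, 23)   # emits FutureWarning, returns datetime.datetime(2022, 5, 23, 0, 0)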
# module level doc-string
__doc__ = """
pandas - a powerful data analysis and manipulation library for Python
=====================================================================
**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.
Main Features
-------------
Here are just a few of the things that pandas does well:
- Easy handling of missing data in floating point as well as non-floating
point data.
- Size mutability: columns can be inserted and deleted from DataFrame and
higher dimensional objects
- Automatic and explicit data alignment: objects can be explicitly aligned
to a set of labels, or the user can simply ignore the labels and let
`Series`, `DataFrame`, etc. automatically align the data for you in
computations.
- Powerful, flexible group by functionality to perform split-apply-combine
operations on data sets, for both aggregating and transforming data.
- Make it easy to convert ragged, differently-indexed data in other Python
and NumPy data structures into DataFrame objects.
- Intelligent label-based slicing, fancy indexing, and subsetting of large
data sets.
- Intuitive merging and joining data sets.
- Flexible reshaping and pivoting of data sets.
- Hierarchical labeling of axes (possible to have multiple labels per tick).
- Robust IO tools for loading data from flat files (CSV and delimited),
Excel files, databases, and saving/loading data from the ultrafast HDF5
format.
- Time series-specific functionality: date range generation and frequency
conversion, moving window statistics, date shifting and lagging.
"""
# Use __all__ to let type checkers know what is part of the public API.
# Pandas is not (yet) a py.typed library: the public API is determined
# based on the documentation.
__all__ = [
"BooleanDtype",
"Categorical",
"CategoricalDtype",
"CategoricalIndex",
"DataFrame",
"DateOffset",
"DatetimeIndex",
"DatetimeTZDtype",
"ExcelFile",
"ExcelWriter",
"Flags",
"Float32Dtype",
"Float64Dtype",
"Grouper",
"HDFStore",
"Index",
"IndexSlice",
"Int16Dtype",
"Int32Dtype",
"Int64Dtype",
"Int8Dtype",
"Interval",
"IntervalDtype",
"IntervalIndex",
"MultiIndex",
"NA",
"NaT",
"NamedAgg",
"Period",
"PeriodDtype",
"PeriodIndex",
"RangeIndex",
"Series",
"SparseDtype",
"StringDtype",
"Timedelta",
"TimedeltaIndex",
"Timestamp",
"UInt16Dtype",
"UInt32Dtype",
"UInt64Dtype",
"UInt8Dtype",
"api",
"array",
"arrays",
"bdate_range",
"concat",
"crosstab",
"cut",
"date_range",
"describe_option",
"errors",
"eval",
"factorize",
"get_dummies",
"get_option",
"infer_freq",
"interval_range",
"io",
"isna",
"isnull",
"json_normalize",
"lreshape",
"melt",
"merge",
"merge_asof",
"merge_ordered",
"notna",
"notnull",
"offsets",
"option_context",
"options",
"period_range",
"pivot",
"pivot_table",
"plotting",
"qcut",
"read_clipboard",
"read_csv",
"read_excel",
"read_feather",
"read_fwf",
"read_gbq",
"read_hdf",
"read_html",
"read_json",
"read_orc",
"read_parquet",
"read_pickle",
"read_sas",
"read_spss",
"read_sql",
"read_sql_query",
"read_sql_table",
"read_stata",
"read_table",
"read_xml",
"reset_option",
"set_eng_float_format",
"set_option",
"show_versions",
"test",
"testing",
"timedelta_range",
"to_datetime",
"to_numeric",
"to_pickle",
"to_timedelta",
"tseries",
"unique",
"value_counts",
"wide_to_long",
]


@@ -0,0 +1,28 @@
"""
pandas._config is considered explicitly upstream of everything else in pandas,
and should have no intra-pandas dependencies.
Importing `dates` and `display` ensures that keys needed by _libs
are initialized.
"""
__all__ = [
"config",
"detect_console_encoding",
"get_option",
"set_option",
"reset_option",
"describe_option",
"option_context",
"options",
]
from pandas._config import config
from pandas._config import dates # noqa:F401
from pandas._config.config import (
describe_option,
get_option,
option_context,
options,
reset_option,
set_option,
)
from pandas._config.display import detect_console_encoding


@@ -0,0 +1,900 @@
"""
The config module holds package-wide configurables and provides
a uniform API for working with them.
Overview
========
This module supports the following requirements:
- options are referenced using keys in dot.notation, e.g. "x.y.option - z".
- keys are case-insensitive.
- functions should accept partial/regex keys, when unambiguous.
- options can be registered by modules at import time.
- options can be registered at init-time (via core.config_init)
- options have a default value, and (optionally) a description and
validation function associated with them.
- options can be deprecated, in which case referencing them
should produce a warning.
- deprecated options can optionally be rerouted to a replacement
so that accessing a deprecated option reroutes to a differently
named option.
- options can be reset to their default value.
- all options can be reset to their default value at once.
- all options in a certain sub-namespace can be reset at once.
- the user can set / get / reset or ask for the description of an option.
- a developer can register and mark an option as deprecated.
- you can register a callback to be invoked when the option value
is set or reset. Changing the stored value is considered misuse, but
is not verboten.
Implementation
==============
- Data is stored using nested dictionaries, and should be accessed
through the provided API.
- "Registered options" and "Deprecated options" have metadata associated
with them, which are stored in auxiliary dictionaries keyed on the
fully-qualified key, e.g. "x.y.z.option".
- the config_init module is imported by the package's __init__.py file.
placing any register_option() calls there will ensure those options
are available as soon as pandas is loaded. If you use register_option
in a module, it will only be available after that module is imported,
which you should be aware of.
- `config_prefix` is a context_manager (for use with the `with` keyword)
which can save developers some typing, see the docstring.
"""
from __future__ import annotations
from contextlib import (
ContextDecorator,
contextmanager,
)
import re
from typing import (
Any,
Callable,
Iterable,
NamedTuple,
cast,
)
import warnings
from pandas._typing import F
class DeprecatedOption(NamedTuple):
key: str
msg: str | None
rkey: str | None
removal_ver: str | None
class RegisteredOption(NamedTuple):
key: str
defval: object
doc: str
validator: Callable[[object], Any] | None
cb: Callable[[str], Any] | None
# holds deprecated option metadata
_deprecated_options: dict[str, DeprecatedOption] = {}
# holds registered option metadata
_registered_options: dict[str, RegisteredOption] = {}
# holds the current values for registered options
_global_config: dict[str, Any] = {}
# keys which have a special meaning
_reserved_keys: list[str] = ["all"]
class OptionError(AttributeError, KeyError):
"""
Exception for pandas.options, backwards compatible with KeyError
checks.
"""
#
# User API
def _get_single_key(pat: str, silent: bool) -> str:
keys = _select_options(pat)
if len(keys) == 0:
if not silent:
_warn_if_deprecated(pat)
raise OptionError(f"No such key(s): {repr(pat)}")
if len(keys) > 1:
raise OptionError("Pattern matched multiple keys")
key = keys[0]
if not silent:
_warn_if_deprecated(key)
key = _translate_key(key)
return key
def _get_option(pat: str, silent: bool = False):
key = _get_single_key(pat, silent)
# walk the nested dict
root, k = _get_root(key)
return root[k]
def _set_option(*args, **kwargs) -> None:
# must have at least 1 arg; deal with constraints later
nargs = len(args)
if not nargs or nargs % 2 != 0:
raise ValueError("Must provide an even number of non-keyword arguments")
# default to false
silent = kwargs.pop("silent", False)
if kwargs:
kwarg = list(kwargs.keys())[0]
raise TypeError(f'_set_option() got an unexpected keyword argument "{kwarg}"')
for k, v in zip(args[::2], args[1::2]):
key = _get_single_key(k, silent)
o = _get_registered_option(key)
if o and o.validator:
o.validator(v)
# walk the nested dict
root, k = _get_root(key)
root[k] = v
if o.cb:
if silent:
with warnings.catch_warnings(record=True):
o.cb(key)
else:
o.cb(key)
def _describe_option(pat: str = "", _print_desc: bool = True):
keys = _select_options(pat)
if len(keys) == 0:
raise OptionError("No such key(s)")
s = "\n".join([_build_option_description(k) for k in keys])
if _print_desc:
print(s)
else:
return s
def _reset_option(pat: str, silent: bool = False) -> None:
keys = _select_options(pat)
if len(keys) == 0:
raise OptionError("No such key(s)")
if len(keys) > 1 and len(pat) < 4 and pat != "all":
raise ValueError(
"You must specify at least 4 characters when "
"resetting multiple keys, use the special keyword "
'"all" to reset all the options to their default value'
)
for k in keys:
_set_option(k, _registered_options[k].defval, silent=silent)
def get_default_val(pat: str):
key = _get_single_key(pat, silent=True)
return _get_registered_option(key).defval
class DictWrapper:
"""provide attribute-style access to a nested dict"""
def __init__(self, d: dict[str, Any], prefix: str = ""):
object.__setattr__(self, "d", d)
object.__setattr__(self, "prefix", prefix)
def __setattr__(self, key: str, val: Any) -> None:
prefix = object.__getattribute__(self, "prefix")
if prefix:
prefix += "."
prefix += key
# you can't set new keys
# and you can't overwrite subtrees
if key in self.d and not isinstance(self.d[key], dict):
_set_option(prefix, val)
else:
raise OptionError("You can only set the value of existing options")
def __getattr__(self, key: str):
prefix = object.__getattribute__(self, "prefix")
if prefix:
prefix += "."
prefix += key
try:
v = object.__getattribute__(self, "d")[key]
except KeyError as err:
raise OptionError("No such option") from err
if isinstance(v, dict):
return DictWrapper(v, prefix)
else:
return _get_option(prefix)
def __dir__(self) -> Iterable[str]:
return list(self.d.keys())
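# Sketch of how DictWrapper backs the attribute-style `options` object
# exposed below (the default value shown is illustrative):
#
#   >>> import pandas as pd
#   >>> pd.options.display.max_rows        # equivalent to get_option("display.max_rows")
#   60
#   >>> pd.options.display.max_rows = 10   # equivalent to set_option("display.max_rows", 10)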
# For user convenience, we'd like to have the available options described
# in the docstring. For dev convenience we'd like to generate the docstrings
dynamically instead of maintaining them by hand. To this end, we use the
class below, which wraps functions inside a callable and converts
__doc__ into a property function. The docstrings below are templates
# using the py2.6+ advanced formatting syntax to plug in a concise list
# of options, and option descriptions.
class CallableDynamicDoc:
def __init__(self, func, doc_tmpl):
self.__doc_tmpl__ = doc_tmpl
self.__func__ = func
def __call__(self, *args, **kwds):
return self.__func__(*args, **kwds)
@property
def __doc__(self):
opts_desc = _describe_option("all", _print_desc=False)
opts_list = pp_options_list(list(_registered_options.keys()))
return self.__doc_tmpl__.format(opts_desc=opts_desc, opts_list=opts_list)
_get_option_tmpl = """
get_option(pat)
Retrieves the value of the specified option.
Available options:
{opts_list}
Parameters
----------
pat : str
Regexp which should match a single option.
Note: partial matches are supported for convenience, but unless you use the
full option name (e.g. x.y.z.option_name), your code may break in future
versions if new options with similar names are introduced.
Returns
-------
result : the value of the option
Raises
------
OptionError : if no such option exists
Notes
-----
Please reference the :ref:`User Guide <options>` for more information.
The available options with their descriptions:
{opts_desc}
"""
_set_option_tmpl = """
set_option(pat, value)
Sets the value of the specified option.
Available options:
{opts_list}
Parameters
----------
pat : str
Regexp which should match a single option.
Note: partial matches are supported for convenience, but unless you use the
full option name (e.g. x.y.z.option_name), your code may break in future
versions if new options with similar names are introduced.
value : object
New value of option.
Returns
-------
None
Raises
------
OptionError if no such option exists
Notes
-----
Please reference the :ref:`User Guide <options>` for more information.
The available options with their descriptions:
{opts_desc}
"""
_describe_option_tmpl = """
describe_option(pat, _print_desc=True)
Prints the description for one or more registered options.
Call with no arguments to get a listing for all registered options.
Available options:
{opts_list}
Parameters
----------
pat : str
Regexp pattern. All matching keys will have their description displayed.
_print_desc : bool, default True
If True (default) the description(s) will be printed to stdout.
Otherwise, the description(s) will be returned as a unicode string
(for testing).
Returns
-------
None by default, the description(s) as a unicode string if _print_desc
is False
Notes
-----
Please reference the :ref:`User Guide <options>` for more information.
The available options with their descriptions:
{opts_desc}
"""
_reset_option_tmpl = """
reset_option(pat)
Reset one or more options to their default value.
Pass "all" as argument to reset all options.
Available options:
{opts_list}
Parameters
----------
pat : str/regex
If specified only options matching `prefix*` will be reset.
Note: partial matches are supported for convenience, but unless you
use the full option name (e.g. x.y.z.option_name), your code may break
in future versions if new options with similar names are introduced.
Returns
-------
None
Notes
-----
Please reference the :ref:`User Guide <options>` for more information.
The available options with their descriptions:
{opts_desc}
"""
# bind the functions with their docstrings into a Callable
# and use that as the functions exposed in pd.api
get_option = CallableDynamicDoc(_get_option, _get_option_tmpl)
set_option = CallableDynamicDoc(_set_option, _set_option_tmpl)
reset_option = CallableDynamicDoc(_reset_option, _reset_option_tmpl)
describe_option = CallableDynamicDoc(_describe_option, _describe_option_tmpl)
options = DictWrapper(_global_config)
#
# Functions for use by pandas developers, in addition to the user API
class option_context(ContextDecorator):
"""
Context manager to temporarily set options in the `with` statement context.
You need to invoke as ``option_context(pat, val, [(pat, val), ...])``.
Examples
--------
>>> with option_context('display.max_rows', 10, 'display.max_columns', 5):
... pass
"""
def __init__(self, *args):
if len(args) % 2 != 0 or len(args) < 2:
raise ValueError(
"Need to invoke as option_context(pat, val, [(pat, val), ...])."
)
self.ops = list(zip(args[::2], args[1::2]))
def __enter__(self):
self.undo = [(pat, _get_option(pat, silent=True)) for pat, val in self.ops]
for pat, val in self.ops:
_set_option(pat, val, silent=True)
def __exit__(self, *args):
if self.undo:
for pat, val in self.undo:
_set_option(pat, val, silent=True)
def register_option(
key: str,
defval: object,
doc: str = "",
validator: Callable[[object], Any] | None = None,
cb: Callable[[str], Any] | None = None,
) -> None:
"""
Register an option in the package-wide pandas config object
Parameters
----------
key : str
Fully-qualified key, e.g. "x.y.option - z".
defval : object
Default value of the option.
doc : str
Description of the option.
validator : Callable, optional
Function of a single argument, should raise `ValueError` if
called with a value which is not a legal value for the option.
cb
a function of a single argument "key", which is called
immediately after an option value is set/reset. key is
the full name of the option.
Raises
------
ValueError if `validator` is specified and `defval` is not a valid value.
"""
import keyword
import tokenize
key = key.lower()
if key in _registered_options:
raise OptionError(f"Option '{key}' has already been registered")
if key in _reserved_keys:
raise OptionError(f"Option '{key}' is a reserved key")
# the default value should be legal
if validator:
validator(defval)
# walk the nested dict, creating dicts as needed along the path
path = key.split(".")
for k in path:
if not re.match("^" + tokenize.Name + "$", k):
raise ValueError(f"{k} is not a valid identifier")
if keyword.iskeyword(k):
raise ValueError(f"{k} is a python keyword")
cursor = _global_config
msg = "Path prefix to option '{option}' is already an option"
for i, p in enumerate(path[:-1]):
if not isinstance(cursor, dict):
raise OptionError(msg.format(option=".".join(path[:i])))
if p not in cursor:
cursor[p] = {}
cursor = cursor[p]
if not isinstance(cursor, dict):
raise OptionError(msg.format(option=".".join(path[:-1])))
cursor[path[-1]] = defval # initialize
# save the option metadata
_registered_options[key] = RegisteredOption(
key=key, defval=defval, doc=doc, validator=validator, cb=cb
)
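# Example of registering a namespaced option with a validator; the
# "mylib.verbose" key is hypothetical and used only for illustration:
#
#   >>> register_option("mylib.verbose", False, "Emit verbose output", validator=is_bool)
#   >>> get_option("mylib.verbose")
#   False
#   >>> set_option("mylib.verbose", "yes")   # raises ValueError from the is_bool validator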
def deprecate_option(
key: str,
msg: str | None = None,
rkey: str | None = None,
removal_ver: str | None = None,
) -> None:
"""
Mark option `key` as deprecated. If code attempts to access this option,
a warning will be produced, using `msg` if given, or a default message
if not.
If `rkey` is given, any access to the key will be re-routed to `rkey`.
Neither the existence of `key` nor that of `rkey` is checked. If they
do not exist, any subsequent access will fail as usual, after the
deprecation warning is given.
Parameters
----------
key : str
Name of the option to be deprecated.
must be a fully-qualified option name (e.g "x.y.z.rkey").
msg : str, optional
Warning message to output when the key is referenced.
if no message is given a default message will be emitted.
rkey : str, optional
Name of an option to reroute access to.
If specified, any referenced `key` will be
re-routed to `rkey` including set/get/reset.
rkey must be a fully-qualified option name (e.g "x.y.z.rkey").
used by the default message if no `msg` is specified.
removal_ver : str, optional
Specifies the version in which this option will
be removed. used by the default message if no `msg` is specified.
Raises
------
OptionError
If the specified key has already been deprecated.
"""
key = key.lower()
if key in _deprecated_options:
raise OptionError(f"Option '{key}' has already been defined as deprecated.")
_deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver)
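# Example of rerouting a deprecated key to its replacement; both keys are
# hypothetical and used only for illustration:
#
#   >>> register_option("mylib.old_key", 0)
#   >>> register_option("mylib.new_key", 1)
#   >>> deprecate_option("mylib.old_key", rkey="mylib.new_key", removal_ver="2.0")
#   >>> get_option("mylib.old_key")   # emits FutureWarning, value comes from "mylib.new_key"
#   1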
#
# functions internal to the module
def _select_options(pat: str) -> list[str]:
"""
returns a list of keys matching `pat`
if pat=="all", returns all registered options
"""
# short-circuit for exact key
if pat in _registered_options:
return [pat]
# else look through all of them
keys = sorted(_registered_options.keys())
if pat == "all": # reserved key
return keys
return [k for k in keys if re.search(pat, k, re.I)]
def _get_root(key: str) -> tuple[dict[str, Any], str]:
path = key.split(".")
cursor = _global_config
for p in path[:-1]:
cursor = cursor[p]
return cursor, path[-1]
def _is_deprecated(key: str) -> bool:
"""Returns True if the given option has been deprecated"""
key = key.lower()
return key in _deprecated_options
def _get_deprecated_option(key: str):
"""
Retrieves the metadata for a deprecated option, if `key` is deprecated.
Returns
-------
DeprecatedOption (namedtuple) if key is deprecated, None otherwise
"""
try:
d = _deprecated_options[key]
except KeyError:
return None
else:
return d
def _get_registered_option(key: str):
"""
Retrieves the option metadata if `key` is a registered option.
Returns
-------
RegisteredOption (namedtuple) if key is a registered option, None otherwise
"""
return _registered_options.get(key)
def _translate_key(key: str) -> str:
"""
If `key` is deprecated and a replacement key is defined, return the
replacement key; otherwise return `key` as-is.
"""
d = _get_deprecated_option(key)
if d:
return d.rkey or key
else:
return key
def _warn_if_deprecated(key: str) -> bool:
"""
Checks if `key` is a deprecated option and if so, emits a warning.
Returns
-------
bool - True if `key` is deprecated, False otherwise.
"""
d = _get_deprecated_option(key)
if d:
if d.msg:
warnings.warn(d.msg, FutureWarning)
else:
msg = f"'{key}' is deprecated"
if d.removal_ver:
msg += f" and will be removed in {d.removal_ver}"
if d.rkey:
msg += f", please use '{d.rkey}' instead."
else:
msg += ", please refrain from using it."
warnings.warn(msg, FutureWarning)
return True
return False
def _build_option_description(k: str) -> str:
"""Builds a formatted description of a registered option and returns it"""
o = _get_registered_option(k)
d = _get_deprecated_option(k)
s = f"{k} "
if o.doc:
s += "\n".join(o.doc.strip().split("\n"))
else:
s += "No description available."
if o:
s += f"\n [default: {o.defval}] [currently: {_get_option(k, True)}]"
if d:
rkey = d.rkey or ""
s += "\n (Deprecated"
s += f", use `{rkey}` instead."
s += ")"
return s
def pp_options_list(keys: Iterable[str], width=80, _print: bool = False):
"""Builds a concise listing of available options, grouped by prefix"""
from itertools import groupby
from textwrap import wrap
def pp(name: str, ks: Iterable[str]) -> list[str]:
pfx = "- " + name + ".[" if name else ""
ls = wrap(
", ".join(ks),
width,
initial_indent=pfx,
subsequent_indent=" ",
break_long_words=False,
)
if ls and ls[-1] and name:
ls[-1] = ls[-1] + "]"
return ls
ls: list[str] = []
singles = [x for x in sorted(keys) if x.find(".") < 0]
if singles:
ls += pp("", singles)
keys = [x for x in keys if x.find(".") >= 0]
for k, g in groupby(sorted(keys), lambda x: x[: x.rfind(".")]):
ks = [x[len(k) + 1 :] for x in list(g)]
ls += pp(k, ks)
s = "\n".join(ls)
if _print:
print(s)
else:
return s
#
# helpers
@contextmanager
def config_prefix(prefix):
"""
contextmanager for multiple invocations of API with a common prefix
supported API functions: (register / get / set)_option
Warning: This is not thread-safe, and won't work properly if you import
the API functions into your module using the "from x import y" construct.
Example
-------
import pandas._config.config as cf
with cf.config_prefix("display.font"):
cf.register_option("color", "red")
cf.register_option("size", " 5 pt")
cf.set_option("size", " 6 pt")
cf.get_option("size")
...
etc'
will register options "display.font.color", "display.font.size", set the
value of "display.font.size"... and so on.
"""
# Note: reset_option relies on set_option, and on key directly
# it does not fit into this monkey-patching scheme
global register_option, get_option, set_option, reset_option
def wrap(func: F) -> F:
def inner(key: str, *args, **kwds):
pkey = f"{prefix}.{key}"
return func(pkey, *args, **kwds)
return cast(F, inner)
_register_option = register_option
_get_option = get_option
_set_option = set_option
set_option = wrap(set_option)
get_option = wrap(get_option)
register_option = wrap(register_option)
try:
yield
finally:
set_option = _set_option
get_option = _get_option
register_option = _register_option
# These factories and methods are handy for use as the validator
# arg in register_option
def is_type_factory(_type: type[Any]) -> Callable[[Any], None]:
"""
Parameters
----------
`_type` - a type to be compared against (e.g. type(x) == `_type`)
Returns
-------
validator - a function of a single argument x, which raises
ValueError if type(x) is not equal to `_type`
"""
def inner(x) -> None:
if type(x) != _type:
raise ValueError(f"Value must have type '{_type}'")
return inner
def is_instance_factory(_type) -> Callable[[Any], None]:
"""
Parameters
----------
`_type` - the type to be checked against
Returns
-------
validator - a function of a single argument x, which raises
ValueError if x is not an instance of `_type`
"""
if isinstance(_type, (tuple, list)):
_type = tuple(_type)
type_repr = "|".join(map(str, _type))
else:
type_repr = f"'{_type}'"
def inner(x) -> None:
if not isinstance(x, _type):
raise ValueError(f"Value must be an instance of {type_repr}")
return inner
def is_one_of_factory(legal_values) -> Callable[[Any], None]:
callables = [c for c in legal_values if callable(c)]
legal_values = [c for c in legal_values if not callable(c)]
def inner(x) -> None:
if x not in legal_values:
if not any(c(x) for c in callables):
uvals = [str(lval) for lval in legal_values]
pp_values = "|".join(uvals)
msg = f"Value must be one of {pp_values}"
if len(callables):
msg += " or a callable"
raise ValueError(msg)
return inner
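# Sketch of a validator built with is_one_of_factory: a value passes if it is
# one of the listed literals or is accepted by any callable in the list:
#
#   >>> validate = is_one_of_factory(["warn", "raise", callable])
#   >>> validate("warn")      # ok
#   >>> validate(print)       # ok, accepted by the callable() check
#   >>> validate("ignore")    # raises ValueError("Value must be one of warn|raise or a callable")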
def is_nonnegative_int(value: object) -> None:
"""
Verify that value is None or a nonnegative int.
Parameters
----------
value : None or int
The `value` to be checked.
Raises
------
ValueError
When the value is neither None nor a nonnegative integer
"""
if value is None:
return
elif isinstance(value, int):
if value >= 0:
return
msg = "Value must be a nonnegative integer or None"
raise ValueError(msg)
# common type validators, for convenience
# usage: register_option(... , validator = is_int)
is_int = is_type_factory(int)
is_bool = is_type_factory(bool)
is_float = is_type_factory(float)
is_str = is_type_factory(str)
is_text = is_instance_factory((str, bytes))
def is_callable(obj) -> bool:
"""
Parameters
----------
`obj` - the object to be checked
Returns
-------
bool - True if the object is callable,
raises ValueError otherwise.
"""
if not callable(obj):
raise ValueError("Value must be a callable")
return True


@@ -0,0 +1,23 @@
"""
config for datetime formatting
"""
from pandas._config import config as cf
pc_date_dayfirst_doc = """
: boolean
When True, prints and parses dates with the day first, e.g. 20/01/2005
"""
pc_date_yearfirst_doc = """
: boolean
When True, prints and parses dates with the year first, e.g. 2005/01/20
"""
with cf.config_prefix("display"):
# Needed upstream of `_libs` because these are used in tslibs.parsing
cf.register_option(
"date_dayfirst", False, pc_date_dayfirst_doc, validator=cf.is_bool
)
cf.register_option(
"date_yearfirst", False, pc_date_yearfirst_doc, validator=cf.is_bool
)


@@ -0,0 +1,62 @@
"""
Unopinionated display configuration.
"""
from __future__ import annotations
import locale
import sys
from pandas._config import config as cf
# -----------------------------------------------------------------------------
# Global formatting options
_initial_defencoding: str | None = None
def detect_console_encoding() -> str:
"""
Try to find the most capable encoding supported by the console.
slightly modified from the way IPython handles the same issue.
"""
global _initial_defencoding
encoding = None
try:
encoding = sys.stdout.encoding or sys.stdin.encoding
except (AttributeError, OSError):
pass
# try again for something better
if not encoding or "ascii" in encoding.lower():
try:
encoding = locale.getpreferredencoding()
except locale.Error:
# can be raised by locale.setlocale(), which is
# called by getpreferredencoding
# (on some systems, see stdlib locale docs)
pass
# when all else fails. this will usually be "ascii"
if not encoding or "ascii" in encoding.lower():
encoding = sys.getdefaultencoding()
# GH#3360, save the reported defencoding at import time
# MPL backends may change it. Make available for debugging.
if not _initial_defencoding:
_initial_defencoding = sys.getdefaultencoding()
return encoding
pc_encoding_doc = """
: str/unicode
Defaults to the detected encoding of the console.
Specifies the encoding to be used for strings returned by to_string,
these are generally strings meant to be displayed on the console.
"""
with cf.config_prefix("display"):
cf.register_option(
"encoding", detect_console_encoding(), pc_encoding_doc, validator=cf.is_text
)


@@ -0,0 +1,178 @@
"""
Helpers for configuring locale settings.
Name `localization` is chosen to avoid overlap with builtin `locale` module.
"""
from __future__ import annotations
from contextlib import contextmanager
import locale
import re
import subprocess
from typing import (
Callable,
Iterator,
)
from pandas._config.config import options
@contextmanager
def set_locale(
new_locale: str | tuple[str, str], lc_var: int = locale.LC_ALL
) -> Iterator[str | tuple[str, str]]:
"""
Context manager for temporarily setting a locale.
Parameters
----------
new_locale : str or tuple
A string of the form <language_country>.<encoding>. For example to set
the current locale to US English with a UTF8 encoding, you would pass
"en_US.UTF-8".
lc_var : int, default `locale.LC_ALL`
The category of the locale being set.
Notes
-----
This is useful when you want to run a particular block of code under a
particular locale, without globally setting the locale. This probably isn't
thread-safe.
"""
current_locale = locale.getlocale()
try:
locale.setlocale(lc_var, new_locale)
normalized_locale = locale.getlocale()
if all(x is not None for x in normalized_locale):
yield ".".join(normalized_locale)
else:
yield new_locale
finally:
locale.setlocale(lc_var, current_locale)
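# Usage sketch (the requested locale must be installed on the host system):
#
#   >>> with set_locale("de_DE.UTF-8") as loc:
#   ...     pass   # code in this block runs under the German locale
#   >>> # the previous locale is restored when the block exits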
def can_set_locale(lc: str, lc_var: int = locale.LC_ALL) -> bool:
"""
Check to see if we can set a locale, and subsequently get the locale,
without raising an Exception.
Parameters
----------
lc : str
The locale to attempt to set.
lc_var : int, default `locale.LC_ALL`
The category of the locale being set.
Returns
-------
bool
Whether the passed locale can be set
"""
try:
with set_locale(lc, lc_var=lc_var):
pass
except (ValueError, locale.Error):
# horrible name for an Exception subclass
return False
else:
return True
def _valid_locales(locales: list[str] | str, normalize: bool) -> list[str]:
"""
Return a list of normalized locales that do not throw an ``Exception``
when set.
Parameters
----------
locales : str
A string where each locale is separated by a newline.
normalize : bool
Whether to call ``locale.normalize`` on each locale.
Returns
-------
valid_locales : list
A list of valid locales.
"""
return [
loc
for loc in (
locale.normalize(loc.strip()) if normalize else loc.strip()
for loc in locales
)
if can_set_locale(loc)
]
def _default_locale_getter() -> bytes:
return subprocess.check_output(["locale -a"], shell=True)
def get_locales(
prefix: str | None = None,
normalize: bool = True,
locale_getter: Callable[[], bytes] = _default_locale_getter,
) -> list[str] | None:
"""
Get all the locales that are available on the system.
Parameters
----------
prefix : str
If not ``None`` then return only those locales with the prefix
provided. For example to get all English language locales (those that
start with ``"en"``), pass ``prefix="en"``.
normalize : bool
Call ``locale.normalize`` on the resulting list of available locales.
If ``True``, only locales that can be set without throwing an
``Exception`` are returned.
locale_getter : callable
The function to use to retrieve the current locales. This should return
a string with each locale separated by a newline character.
Returns
-------
locales : list of strings
A list of locale strings that can be set with ``locale.setlocale()``.
For example::
locale.setlocale(locale.LC_ALL, locale_string)
On error will return None (no locale available, e.g. Windows)
"""
try:
raw_locales = locale_getter()
except subprocess.CalledProcessError:
# Raised on (some? all?) Windows platforms because "locale -a"
# is not defined
return None
try:
# raw_locales is "\n" separated list of locales
# it may contain non-decodable parts, so split
# extract what we can and then rejoin.
split_raw_locales = raw_locales.split(b"\n")
out_locales = []
for x in split_raw_locales:
try:
out_locales.append(str(x, encoding=options.display.encoding))
except UnicodeError:
# 'locale -a' is used to populate 'raw_locales' and on
# Redhat 7 Linux (and maybe others) prints locale names
# using windows-1252 encoding. Bug only triggered by
# a few special characters and when there is an
# extensive list of installed locales.
out_locales.append(str(x, encoding="windows-1252"))
except TypeError:
pass
if prefix is None:
return _valid_locales(out_locales, normalize)
pattern = re.compile(f"{prefix}.*")
found = pattern.findall("\n".join(out_locales))
return _valid_locales(found, normalize)
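# Usage sketch; the result depends on the locales installed on the host, so
# the output shown here is illustrative only:
#
#   >>> get_locales(prefix="en")
#   ['en_AU.utf8', 'en_GB.utf8', 'en_US.utf8']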


@@ -0,0 +1,22 @@
__all__ = [
"NaT",
"NaTType",
"OutOfBoundsDatetime",
"Period",
"Timedelta",
"Timestamp",
"iNaT",
"Interval",
]
from pandas._libs.interval import Interval
from pandas._libs.tslibs import (
NaT,
NaTType,
OutOfBoundsDatetime,
Period,
Timedelta,
Timestamp,
iNaT,
)


@@ -0,0 +1,12 @@
from pandas._libs.dtypes cimport numeric_t
cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil
cdef enum TiebreakEnumType:
TIEBREAK_AVERAGE
TIEBREAK_MIN
TIEBREAK_MAX
TIEBREAK_FIRST
TIEBREAK_FIRST_DESCENDING
TIEBREAK_DENSE


@@ -0,0 +1,446 @@
from __future__ import annotations
from typing import Any
import numpy as np
from pandas._typing import npt
class Infinity:
"""
Provide a positive Infinity comparison method for ranking.
"""
def __eq__(self, other) -> bool: ...
def __ne__(self, other) -> bool: ...
def __lt__(self, other) -> bool: ...
def __le__(self, other) -> bool: ...
def __gt__(self, other) -> bool: ...
def __ge__(self, other) -> bool: ...
class NegInfinity:
"""
Provide a negative Infinity comparison method for ranking.
"""
def __eq__(self, other) -> bool: ...
def __ne__(self, other) -> bool: ...
def __lt__(self, other) -> bool: ...
def __le__(self, other) -> bool: ...
def __gt__(self, other) -> bool: ...
def __ge__(self, other) -> bool: ...
def unique_deltas(
arr: np.ndarray, # const int64_t[:]
) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1]
def is_lexsorted(list_of_arrays: list[npt.NDArray[np.int64]]) -> bool: ...
def groupsort_indexer(
index: np.ndarray, # const int64_t[:]
ngroups: int,
) -> tuple[
np.ndarray, # ndarray[int64_t, ndim=1]
np.ndarray, # ndarray[int64_t, ndim=1]
]: ...
def kth_smallest(
a: np.ndarray, # numeric[:]
k: int,
) -> Any: ... # numeric
# ----------------------------------------------------------------------
# Pairwise correlation/covariance
def nancorr(
mat: npt.NDArray[np.float64], # const float64_t[:, :]
cov: bool = ...,
minp: int | None = ...,
) -> npt.NDArray[np.float64]: ... # ndarray[float64_t, ndim=2]
def nancorr_spearman(
mat: npt.NDArray[np.float64], # ndarray[float64_t, ndim=2]
minp: int = ...,
) -> npt.NDArray[np.float64]: ... # ndarray[float64_t, ndim=2]
# ----------------------------------------------------------------------
# ctypedef fused algos_t:
# float64_t
# float32_t
# object
# int64_t
# int32_t
# int16_t
# int8_t
# uint64_t
# uint32_t
# uint16_t
# uint8_t
def validate_limit(nobs: int | None, limit=...) -> int: ...
def pad(
old: np.ndarray, # ndarray[algos_t]
new: np.ndarray, # ndarray[algos_t]
limit=...,
) -> npt.NDArray[np.intp]: ... # np.ndarray[np.intp, ndim=1]
def pad_inplace(
values: np.ndarray, # algos_t[:]
mask: np.ndarray, # uint8_t[:]
limit=...,
) -> None: ...
def pad_2d_inplace(
values: np.ndarray, # algos_t[:, :]
mask: np.ndarray, # const uint8_t[:, :]
limit=...,
) -> None: ...
def backfill(
old: np.ndarray, # ndarray[algos_t]
new: np.ndarray, # ndarray[algos_t]
limit=...,
) -> npt.NDArray[np.intp]: ... # np.ndarray[np.intp, ndim=1]
def backfill_inplace(
values: np.ndarray, # algos_t[:]
mask: np.ndarray, # uint8_t[:]
limit=...,
) -> None: ...
def backfill_2d_inplace(
values: np.ndarray, # algos_t[:, :]
mask: np.ndarray, # const uint8_t[:, :]
limit=...,
) -> None: ...
def is_monotonic(
arr: np.ndarray, # ndarray[algos_t, ndim=1]
timelike: bool,
) -> tuple[bool, bool, bool]: ...
# ----------------------------------------------------------------------
# rank_1d, rank_2d
# ----------------------------------------------------------------------
# ctypedef fused rank_t:
# object
# float64_t
# uint64_t
# int64_t
def rank_1d(
values: np.ndarray, # ndarray[rank_t, ndim=1]
labels: np.ndarray | None = ..., # const int64_t[:]=None
is_datetimelike: bool = ...,
ties_method=...,
ascending: bool = ...,
pct: bool = ...,
na_option=...,
) -> np.ndarray: ... # np.ndarray[float64_t, ndim=1]
def rank_2d(
in_arr: np.ndarray, # ndarray[rank_t, ndim=2]
axis: int = ...,
is_datetimelike: bool = ...,
ties_method=...,
ascending: bool = ...,
na_option=...,
pct: bool = ...,
) -> np.ndarray: ... # np.ndarray[float64_t, ndim=1]
def diff_2d(
arr: np.ndarray, # ndarray[diff_t, ndim=2]
out: np.ndarray, # ndarray[out_t, ndim=2]
periods: int,
axis: int,
datetimelike: bool = ...,
) -> None: ...
def ensure_platform_int(arr: object) -> npt.NDArray[np.intp]: ...
def ensure_object(arr: object) -> npt.NDArray[np.object_]: ...
def ensure_complex64(arr: object, copy=...) -> npt.NDArray[np.complex64]: ...
def ensure_complex128(arr: object, copy=...) -> npt.NDArray[np.complex128]: ...
def ensure_float64(arr: object, copy=...) -> npt.NDArray[np.float64]: ...
def ensure_float32(arr: object, copy=...) -> npt.NDArray[np.float32]: ...
def ensure_int8(arr: object, copy=...) -> npt.NDArray[np.int8]: ...
def ensure_int16(arr: object, copy=...) -> npt.NDArray[np.int16]: ...
def ensure_int32(arr: object, copy=...) -> npt.NDArray[np.int32]: ...
def ensure_int64(arr: object, copy=...) -> npt.NDArray[np.int64]: ...
def ensure_uint8(arr: object, copy=...) -> npt.NDArray[np.uint8]: ...
def ensure_uint16(arr: object, copy=...) -> npt.NDArray[np.uint16]: ...
def ensure_uint32(arr: object, copy=...) -> npt.NDArray[np.uint32]: ...
def ensure_uint64(arr: object, copy=...) -> npt.NDArray[np.uint64]: ...
def take_1d_int8_int8(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int8_int32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int8_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int8_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int16_int16(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int16_int32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int16_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int16_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int32_int32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int32_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int32_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int64_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int64_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_float32_float32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_float32_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_float64_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_object_object(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_bool_bool(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_bool_object(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int8_int8(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int8_int32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int8_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int8_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int16_int16(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int16_int32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int16_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int16_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int32_int32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int32_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int32_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int64_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int64_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_float32_float32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_float32_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_float64_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_object_object(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_bool_bool(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_bool_object(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int8_int8(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int8_int32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int8_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int8_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int16_int16(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int16_int32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int16_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int16_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int32_int32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int32_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int32_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int64_int64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int64_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_float32_float32(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_float32_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_float64_float64(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_object_object(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_bool_bool(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_bool_object(
values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_multi_int8_int8(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int8_int32(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int8_int64(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int8_float64(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int16_int16(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int16_int32(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int16_int64(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int16_float64(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int32_int32(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int32_int64(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int32_float64(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int64_float64(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_float32_float32(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_float32_float64(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_float64_float64(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_object_object(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_bool_bool(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_bool_object(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...
def take_2d_multi_int64_int64(
values: np.ndarray,
indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
out: np.ndarray,
fill_value=...,
) -> None: ...

File diff suppressed because it is too large.


@@ -0,0 +1,72 @@
"""
Template for each `dtype` helper function using 1-d template
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
# ----------------------------------------------------------------------
# ensure_dtype
# ----------------------------------------------------------------------
def ensure_platform_int(object arr):
# GH3033, GH1392
# platform int is the size of the int pointer, e.g. np.intp
if util.is_array(arr):
if (<ndarray>arr).descr.type_num == cnp.NPY_INTP:
return arr
else:
# equiv: arr.astype(np.intp)
return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_INTP)
else:
return np.array(arr, dtype=np.intp)
def ensure_object(object arr):
if util.is_array(arr):
if (<ndarray>arr).descr.type_num == NPY_OBJECT:
return arr
else:
# equiv: arr.astype(object)
return cnp.PyArray_Cast(<ndarray>arr, NPY_OBJECT)
else:
return np.array(arr, dtype=np.object_)
{{py:
# name, c_type, dtype
dtypes = [('float64', 'FLOAT64', 'float64'),
# ('float32', 'FLOAT32', 'float32'), # disabling bc unused
('int8', 'INT8', 'int8'),
('int16', 'INT16', 'int16'),
('int32', 'INT32', 'int32'),
('int64', 'INT64', 'int64'),
# Disabling uint and complex dtypes because we do not use them
# (and compiling them increases wheel size)
# ('uint8', 'UINT8', 'uint8'),
# ('uint16', 'UINT16', 'uint16'),
# ('uint32', 'UINT32', 'uint32'),
# ('uint64', 'UINT64', 'uint64'),
# ('complex64', 'COMPLEX64', 'complex64'),
# ('complex128', 'COMPLEX128', 'complex128')
]
def get_dispatch(dtypes):
for name, c_type, dtype in dtypes:
yield name, c_type, dtype
}}
{{for name, c_type, dtype in get_dispatch(dtypes)}}
def ensure_{{name}}(object arr, copy=True):
if util.is_array(arr):
if (<ndarray>arr).descr.type_num == NPY_{{c_type}}:
return arr
else:
return arr.astype(np.{{dtype}}, copy=copy)
else:
return np.array(arr, dtype=np.{{dtype}})
{{endfor}}


@@ -0,0 +1,222 @@
"""
Template for each `dtype` helper function for take
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
# ----------------------------------------------------------------------
# take_1d, take_2d
# ----------------------------------------------------------------------
{{py:
# c_type_in, c_type_out
dtypes = [
('uint8_t', 'uint8_t'),
('uint8_t', 'object'),
('int8_t', 'int8_t'),
('int8_t', 'int32_t'),
('int8_t', 'int64_t'),
('int8_t', 'float64_t'),
('int16_t', 'int16_t'),
('int16_t', 'int32_t'),
('int16_t', 'int64_t'),
('int16_t', 'float64_t'),
('int32_t', 'int32_t'),
('int32_t', 'int64_t'),
('int32_t', 'float64_t'),
('int64_t', 'int64_t'),
('int64_t', 'float64_t'),
('float32_t', 'float32_t'),
('float32_t', 'float64_t'),
('float64_t', 'float64_t'),
('object', 'object'),
]
def get_dispatch(dtypes):
for (c_type_in, c_type_out) in dtypes:
def get_name(dtype_name):
if dtype_name == "object":
return "object"
if dtype_name == "uint8_t":
return "bool"
return dtype_name[:-2]
name = get_name(c_type_in)
dest = get_name(c_type_out)
args = dict(name=name, dest=dest, c_type_in=c_type_in,
c_type_out=c_type_out)
yield (name, dest, c_type_in, c_type_out)
}}
{{for name, dest, c_type_in, c_type_out in get_dispatch(dtypes)}}
@cython.wraparound(False)
@cython.boundscheck(False)
{{if c_type_in != "object"}}
def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values,
{{else}}
def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values,
{{endif}}
const intp_t[:] indexer,
{{c_type_out}}[:] out,
fill_value=np.nan):
cdef:
Py_ssize_t i, n, idx
{{c_type_out}} fv
n = indexer.shape[0]
fv = fill_value
{{if c_type_out != "object"}}
with nogil:
{{else}}
if True:
{{endif}}
for i in range(n):
idx = indexer[i]
if idx == -1:
out[i] = fv
else:
{{if c_type_in == "uint8_t" and c_type_out == "object"}}
out[i] = True if values[idx] > 0 else False
{{else}}
out[i] = values[idx]
{{endif}}
@cython.wraparound(False)
@cython.boundscheck(False)
{{if c_type_in != "object"}}
def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
{{else}}
def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
{{endif}}
ndarray[intp_t, ndim=1] indexer,
{{c_type_out}}[:, :] out,
fill_value=np.nan):
cdef:
Py_ssize_t i, j, k, n, idx
{{c_type_out}} fv
{{if c_type_in == c_type_out != "object"}}
const {{c_type_out}} *v
{{c_type_out}} *o
{{endif}}
n = len(indexer)
k = values.shape[1]
fv = fill_value
{{if c_type_in == c_type_out != "object"}}
# GH#3130
if (values.strides[1] == out.strides[1] and
values.strides[1] == sizeof({{c_type_out}}) and
sizeof({{c_type_out}}) * n >= 256):
for i in range(n):
idx = indexer[i]
if idx == -1:
for j in range(k):
out[i, j] = fv
else:
v = &values[idx, 0]
o = &out[i, 0]
memmove(o, v, <size_t>(sizeof({{c_type_out}}) * k))
return
{{endif}}
for i in range(n):
idx = indexer[i]
if idx == -1:
for j in range(k):
out[i, j] = fv
else:
for j in range(k):
{{if c_type_in == "uint8_t" and c_type_out == "object"}}
out[i, j] = True if values[idx, j] > 0 else False
{{else}}
out[i, j] = values[idx, j]
{{endif}}
@cython.wraparound(False)
@cython.boundscheck(False)
{{if c_type_in != "object"}}
def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
{{else}}
def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
{{endif}}
ndarray[intp_t, ndim=1] indexer,
{{c_type_out}}[:, :] out,
fill_value=np.nan):
cdef:
Py_ssize_t i, j, k, n, idx
{{c_type_out}} fv
n = len(values)
k = len(indexer)
if n == 0 or k == 0:
return
fv = fill_value
for i in range(n):
for j in range(k):
idx = indexer[j]
if idx == -1:
out[i, j] = fv
else:
{{if c_type_in == "uint8_t" and c_type_out == "object"}}
out[i, j] = True if values[i, idx] > 0 else False
{{else}}
out[i, j] = values[i, idx]
{{endif}}
@cython.wraparound(False)
@cython.boundscheck(False)
def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
indexer,
ndarray[{{c_type_out}}, ndim=2] out,
fill_value=np.nan):
cdef:
Py_ssize_t i, j, k, n, idx
ndarray[intp_t, ndim=1] idx0 = indexer[0]
ndarray[intp_t, ndim=1] idx1 = indexer[1]
{{c_type_out}} fv
n = len(idx0)
k = len(idx1)
fv = fill_value
for i in range(n):
idx = idx0[i]
if idx == -1:
for j in range(k):
out[i, j] = fv
else:
for j in range(k):
if idx1[j] == -1:
out[i, j] = fv
else:
{{if c_type_in == "uint8_t" and c_type_out == "object"}}
out[i, j] = True if values[idx, idx1[j]] > 0 else False
{{else}}
out[i, j] = values[idx, idx1[j]]
{{endif}}
{{endfor}}
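All of the generated take helpers share one contract: wherever the indexer holds -1, the output receives fill_value rather than an element of values. A plain-NumPy sketch of the 1-D case (illustrative, not the generated code):

# Pure-Python sketch of the take_1d semantics (illustrative only).
import numpy as np

def take_1d_sketch(values, indexer, fill_value=np.nan):
    out = np.empty(len(indexer), dtype=np.float64)
    for i, idx in enumerate(indexer):
        # -1 marks a missing position and is filled, never used as an index
        out[i] = fill_value if idx == -1 else values[idx]
    return out

take_1d_sketch(np.array([10.0, 20.0, 30.0]), np.array([2, -1, 0]))
# -> array([30., nan, 10.])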

View File

@@ -0,0 +1,11 @@
from numpy cimport ndarray
cdef class NDArrayBacked:
cdef:
readonly ndarray _ndarray
readonly object _dtype
cpdef NDArrayBacked _from_backing_data(self, ndarray values)
cpdef __setstate__(self, state)

View File

@@ -0,0 +1,34 @@
from typing import Sequence
import numpy as np
from pandas._typing import (
DtypeObj,
Shape,
)
class NDArrayBacked:
_dtype: DtypeObj
_ndarray: np.ndarray
def __init__(self, values: np.ndarray, dtype: DtypeObj): ...
@classmethod
def _simple_new(cls, values: np.ndarray, dtype: DtypeObj): ...
def _from_backing_data(self, values: np.ndarray): ...
def __setstate__(self, state): ...
def __len__(self) -> int: ...
@property
def shape(self) -> Shape: ...
@property
def ndim(self) -> int: ...
@property
def size(self) -> int: ...
@property
def nbytes(self) -> int: ...
def copy(self): ...
def delete(self, loc, axis=...): ...
def swapaxes(self, axis1, axis2): ...
def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ...
def reshape(self, *args, **kwargs): ...
def ravel(self, order=...): ...
@property
def T(self): ...

View File

@@ -0,0 +1,183 @@
"""
Cython implementations for internal ExtensionArrays.
"""
cimport cython
import numpy as np
cimport numpy as cnp
from cpython cimport PyErr_Clear
from numpy cimport ndarray
cnp.import_array()
@cython.freelist(16)
cdef class NDArrayBacked:
"""
Implementing these methods in cython improves performance quite a bit.
import pandas as pd
from pandas._libs.arrays import NDArrayBacked as cls
dti = pd.date_range("2016-01-01", periods=3)
dta = dti._data
arr = dta._ndarray
obj = cls._simple_new(arr, arr.dtype)
# for foo in [arr, dta, obj]: ...
%timeit foo.copy()
299 ns ± 30 ns per loop # <-- arr underlying ndarray (for reference)
530 ns ± 9.24 ns per loop # <-- dta with cython NDArrayBacked
1.66 µs ± 46.3 ns per loop # <-- dta without cython NDArrayBacked
328 ns ± 5.29 ns per loop # <-- obj with NDArrayBacked.__cinit__
371 ns ± 6.97 ns per loop # <-- obj with NDArrayBacked._simple_new
%timeit foo.T
125 ns ± 6.27 ns per loop # <-- arr underlying ndarray (for reference)
226 ns ± 7.66 ns per loop # <-- dta with cython NDArrayBacked
911 ns ± 16.6 ns per loop # <-- dta without cython NDArrayBacked
215 ns ± 4.54 ns per loop # <-- obj with NDArrayBacked._simple_new
"""
# TODO: implement take in terms of cnp.PyArray_TakeFrom
# TODO: implement concat_same_type in terms of cnp.PyArray_Concatenate
# cdef:
# readonly ndarray _ndarray
# readonly object _dtype
def __init__(self, ndarray values, object dtype):
self._ndarray = values
self._dtype = dtype
@classmethod
def _simple_new(cls, ndarray values, object dtype):
cdef:
NDArrayBacked obj
obj = NDArrayBacked.__new__(cls)
obj._ndarray = values
obj._dtype = dtype
return obj
cpdef NDArrayBacked _from_backing_data(self, ndarray values):
"""
Construct a new ExtensionArray of the same type with `values` as its _ndarray.
This should round-trip:
self == self._from_backing_data(self._ndarray)
"""
# TODO: reuse _simple_new if/when it can be cpdef
cdef:
NDArrayBacked obj
obj = NDArrayBacked.__new__(type(self))
obj._ndarray = values
obj._dtype = self._dtype
return obj
cpdef __setstate__(self, state):
if isinstance(state, dict):
if "_data" in state:
data = state.pop("_data")
elif "_ndarray" in state:
data = state.pop("_ndarray")
else:
raise ValueError # pragma: no cover
self._ndarray = data
self._dtype = state.pop("_dtype")
for key, val in state.items():
setattr(self, key, val)
elif isinstance(state, tuple):
if len(state) != 3:
if len(state) == 1 and isinstance(state[0], dict):
self.__setstate__(state[0])
return
raise NotImplementedError(state) # pragma: no cover
data, dtype = state[:2]
if isinstance(dtype, np.ndarray):
dtype, data = data, dtype
self._ndarray = data
self._dtype = dtype
if isinstance(state[2], dict):
for key, val in state[2].items():
setattr(self, key, val)
else:
raise NotImplementedError(state) # pragma: no cover
else:
raise NotImplementedError(state) # pragma: no cover
def __len__(self) -> int:
return len(self._ndarray)
@property
def shape(self):
# object cast bc _ndarray.shape is npy_intp*
return (<object>(self._ndarray)).shape
@property
def ndim(self) -> int:
return self._ndarray.ndim
@property
def size(self) -> int:
return self._ndarray.size
@property
def nbytes(self) -> int:
return self._ndarray.nbytes
def copy(self, order="C"):
cdef:
cnp.NPY_ORDER order_code
int success
success = cnp.PyArray_OrderConverter(order, &order_code)
if not success:
# clear exception so that we don't get a SystemError
PyErr_Clear()
# same message used by numpy
msg = f"order must be one of 'C', 'F', 'A', or 'K' (got '{order}')"
raise ValueError(msg)
res_values = cnp.PyArray_NewCopy(self._ndarray, order_code)
return self._from_backing_data(res_values)
def delete(self, loc, axis=0):
res_values = np.delete(self._ndarray, loc, axis=axis)
return self._from_backing_data(res_values)
def swapaxes(self, axis1, axis2):
res_values = cnp.PyArray_SwapAxes(self._ndarray, axis1, axis2)
return self._from_backing_data(res_values)
# TODO: pass NPY_MAXDIMS equiv to axis=None?
def repeat(self, repeats, axis: int = 0):
if axis is None:
axis = 0
res_values = cnp.PyArray_Repeat(self._ndarray, repeats, <int>axis)
return self._from_backing_data(res_values)
def reshape(self, *args, **kwargs):
res_values = self._ndarray.reshape(*args, **kwargs)
return self._from_backing_data(res_values)
def ravel(self, order="C"):
# cnp.PyArray_OrderConverter(PyObject* obj, NPY_ORDER* order)
# res_values = cnp.PyArray_Ravel(self._ndarray, order)
res_values = self._ndarray.ravel(order)
return self._from_backing_data(res_values)
@property
def T(self):
res_values = self._ndarray.T
return self._from_backing_data(res_values)
def transpose(self, *axes):
res_values = self._ndarray.transpose(*axes)
return self._from_backing_data(res_values)
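The pattern the class provides: subclasses keep their data in _ndarray, and every reshaping operation hands back an instance of the subclass via _from_backing_data. A minimal pure-Python sketch, assuming a hypothetical subclass named Wrapped:

# Wrapped is a hypothetical subclass used only for illustration.
import numpy as np
from pandas._libs.arrays import NDArrayBacked

class Wrapped(NDArrayBacked):
    pass

arr = np.arange(5)
obj = Wrapped._simple_new(arr, arr.dtype)
copied = obj.copy()                  # routed through _from_backing_data
assert type(copied) is Wrapped
assert (copied._ndarray == arr).all()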

View File

@@ -0,0 +1,48 @@
"""
Common location for shared fused types
"""
from numpy cimport (
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)
# All numeric types except complex
ctypedef fused numeric_t:
int8_t
int16_t
int32_t
int64_t
uint8_t
uint16_t
uint32_t
uint64_t
float32_t
float64_t
# All numeric types + object, doesn't include complex
ctypedef fused numeric_object_t:
numeric_t
object
# i64 + u64 + all float types
ctypedef fused iu_64_floating_t:
float64_t
float32_t
int64_t
uint64_t
# i64 + u64 + all float types + object
ctypedef fused iu_64_floating_obj_t:
iu_64_floating_t
object

View File

@@ -0,0 +1,159 @@
from typing import Literal
import numpy as np
from pandas._typing import npt
def group_median_float64(
out: np.ndarray, # ndarray[float64_t, ndim=2]
counts: npt.NDArray[np.int64],
values: np.ndarray, # ndarray[float64_t, ndim=2]
labels: npt.NDArray[np.int64],
min_count: int = ..., # Py_ssize_t
) -> None: ...
def group_cumprod_float64(
out: np.ndarray, # float64_t[:, ::1]
values: np.ndarray, # const float64_t[:, :]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
is_datetimelike: bool,
skipna: bool = ...,
) -> None: ...
def group_cumsum(
out: np.ndarray, # numeric[:, ::1]
values: np.ndarray, # ndarray[numeric, ndim=2]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
is_datetimelike: bool,
skipna: bool = ...,
) -> None: ...
def group_shift_indexer(
out: np.ndarray, # int64_t[::1]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
periods: int,
) -> None: ...
def group_fillna_indexer(
out: np.ndarray, # ndarray[intp_t]
labels: np.ndarray, # ndarray[int64_t]
sorted_labels: npt.NDArray[np.intp],
mask: npt.NDArray[np.uint8],
direction: Literal["ffill", "bfill"],
limit: int, # int64_t
dropna: bool,
) -> None: ...
def group_any_all(
out: np.ndarray, # uint8_t[::1]
values: np.ndarray, # const uint8_t[::1]
labels: np.ndarray, # const int64_t[:]
mask: np.ndarray, # const uint8_t[::1]
val_test: Literal["any", "all"],
skipna: bool,
) -> None: ...
def group_add(
out: np.ndarray, # complexfloating_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[complexfloating_t, ndim=2]
labels: np.ndarray, # const intp_t[:]
min_count: int = ...,
datetimelike: bool = ...,
) -> None: ...
def group_prod(
out: np.ndarray, # floating[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[floating, ndim=2]
labels: np.ndarray, # const intp_t[:]
min_count: int = ...,
) -> None: ...
def group_var(
out: np.ndarray, # floating[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[floating, ndim=2]
labels: np.ndarray, # const intp_t[:]
min_count: int = ..., # Py_ssize_t
ddof: int = ..., # int64_t
) -> None: ...
def group_mean(
out: np.ndarray, # floating[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[floating, ndim=2]
labels: np.ndarray, # const intp_t[:]
min_count: int = ..., # Py_ssize_t
is_datetimelike: bool = ..., # bint
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_ohlc(
out: np.ndarray, # floating[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[floating, ndim=2]
labels: np.ndarray, # const intp_t[:]
min_count: int = ...,
) -> None: ...
def group_quantile(
out: npt.NDArray[np.float64],
values: np.ndarray, # ndarray[numeric, ndim=1]
labels: npt.NDArray[np.intp],
mask: npt.NDArray[np.uint8],
sort_indexer: npt.NDArray[np.intp], # const
qs: npt.NDArray[np.float64], # const
interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"],
) -> None: ...
def group_last(
out: np.ndarray, # rank_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[rank_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ..., # Py_ssize_t
) -> None: ...
def group_nth(
out: np.ndarray, # rank_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[rank_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ..., # int64_t
rank: int = ..., # int64_t
) -> None: ...
def group_rank(
out: np.ndarray, # float64_t[:, ::1]
values: np.ndarray, # ndarray[rank_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
is_datetimelike: bool,
ties_method: Literal["average", "min", "max", "first", "dense"] = ...,
ascending: bool = ...,
pct: bool = ...,
na_option: Literal["keep", "top", "bottom"] = ...,
) -> None: ...
def group_max(
out: np.ndarray, # groupby_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_min(
out: np.ndarray, # groupby_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_cummin(
out: np.ndarray, # groupby_t[:, ::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
is_datetimelike: bool,
) -> None: ...
def group_cummax(
out: np.ndarray, # groupby_t[:, ::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
is_datetimelike: bool,
) -> None: ...
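These stubs describe the Cython group-by kernels; user code never calls them directly. A hedged sketch of how they are exercised (illustrative; the exact dispatch path is an internal detail):

# Illustrative only: a grouped cumulative sum is ultimately served by a kernel
# such as group_cumsum declared above.
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, 2.0, 3.0]})
df.groupby("key")["val"].cumsum()
# 0    1.0
# 1    3.0
# 2    3.0
# Name: val, dtype: float64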

File diff suppressed because it is too large.

View File

@@ -0,0 +1,9 @@
import numpy as np
from pandas._typing import npt
def hash_object_array(
arr: npt.NDArray[np.object_],
key: str,
encoding: str = ...,
) -> npt.NDArray[np.uint64]: ...

View File

@@ -0,0 +1,198 @@
# Translated from the reference implementation
# at https://github.com/veorq/SipHash
import cython
from libc.stdlib cimport (
free,
malloc,
)
import numpy as np
from numpy cimport (
import_array,
ndarray,
uint8_t,
uint32_t,
uint64_t,
)
import_array()
from pandas._libs.util cimport is_nan
DEF cROUNDS = 2
DEF dROUNDS = 4
@cython.boundscheck(False)
def hash_object_array(
ndarray[object] arr, str key, str encoding="utf8"
) -> np.ndarray[np.uint64]:
"""
Parameters
----------
arr : 1-d object ndarray of objects
key : hash key; must encode to exactly 16 bytes
encoding : encoding for key & arr, defaults to 'utf8'
Returns
-------
1-d uint64 ndarray of hashes.
Raises
------
TypeError
If the array contains mixed types.
Notes
-----
Allowed values must be strings or nulls;
mixed array types will raise TypeError.
"""
cdef:
Py_ssize_t i, n
uint64_t[:] result
bytes data, k
uint8_t *kb
uint64_t *lens
char **vecs
char *cdata
object val
list datas = []
k = <bytes>key.encode(encoding)
kb = <uint8_t *>k
if len(k) != 16:
raise ValueError(
f"key should be a 16-byte string encoded, got {k} (len {len(k)})"
)
n = len(arr)
# create an array of bytes
vecs = <char **>malloc(n * sizeof(char *))
lens = <uint64_t*>malloc(n * sizeof(uint64_t))
for i in range(n):
val = arr[i]
if isinstance(val, bytes):
data = <bytes>val
elif isinstance(val, str):
data = <bytes>val.encode(encoding)
elif val is None or is_nan(val):
# null, stringify and encode
data = <bytes>str(val).encode(encoding)
elif isinstance(val, tuple):
# GH#28969 we could have a tuple, but need to ensure that
# the tuple entries are themselves hashable before converting
# to str
hash(val)
data = <bytes>str(val).encode(encoding)
else:
raise TypeError(
f"{val} of type {type(val)} is not a valid type for hashing, "
"must be string or null"
)
lens[i] = len(data)
cdata = data
# keep the references alive through the end of the
# function
datas.append(data)
vecs[i] = cdata
result = np.empty(n, dtype=np.uint64)
with nogil:
for i in range(n):
result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
free(vecs)
free(lens)
return result.base # .base to retrieve underlying np.ndarray
cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
return (x << b) | (x >> (64 - b))
cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
return (<uint64_t>p[0] |
<uint64_t>p[1] << 8 |
<uint64_t>p[2] << 16 |
<uint64_t>p[3] << 24 |
<uint64_t>p[4] << 32 |
<uint64_t>p[5] << 40 |
<uint64_t>p[6] << 48 |
<uint64_t>p[7] << 56)
cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
uint64_t* v2, uint64_t* v3) nogil:
v0[0] += v1[0]
v1[0] = _rotl(v1[0], 13)
v1[0] ^= v0[0]
v0[0] = _rotl(v0[0], 32)
v2[0] += v3[0]
v3[0] = _rotl(v3[0], 16)
v3[0] ^= v2[0]
v0[0] += v3[0]
v3[0] = _rotl(v3[0], 21)
v3[0] ^= v0[0]
v2[0] += v1[0]
v1[0] = _rotl(v1[0], 17)
v1[0] ^= v2[0]
v2[0] = _rotl(v2[0], 32)
@cython.cdivision(True)
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
uint8_t* key) nogil:
cdef uint64_t v0 = 0x736f6d6570736575ULL
cdef uint64_t v1 = 0x646f72616e646f6dULL
cdef uint64_t v2 = 0x6c7967656e657261ULL
cdef uint64_t v3 = 0x7465646279746573ULL
cdef uint64_t b
cdef uint64_t k0 = u8to64_le(key)
cdef uint64_t k1 = u8to64_le(key + 8)
cdef uint64_t m
cdef int i
cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
cdef int left = datalen & 7
cdef int left_byte
b = (<uint64_t>datalen) << 56
v3 ^= k1
v2 ^= k0
v1 ^= k1
v0 ^= k0
while (data != end):
m = u8to64_le(data)
v3 ^= m
for i in range(cROUNDS):
_sipround(&v0, &v1, &v2, &v3)
v0 ^= m
data += sizeof(uint64_t)
for i in range(left-1, -1, -1):
b |= (<uint64_t>data[i]) << (i * 8)
v3 ^= b
for i in range(cROUNDS):
_sipround(&v0, &v1, &v2, &v3)
v0 ^= b
v2 ^= 0xff
for i in range(dROUNDS):
_sipround(&v0, &v1, &v2, &v3)
b = v0 ^ v1 ^ v2 ^ v3
return b
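A usage sketch (illustrative): the key must encode to exactly 16 bytes; elsewhere in the codebase pandas uses "0123456789123456" as its default hashing key.

# Usage sketch of hash_object_array (illustrative only).
import numpy as np
from pandas._libs.hashing import hash_object_array

arr = np.array(["a", "b", None], dtype=object)
hashed = hash_object_array(arr, key="0123456789123456")
hashed.dtype    # dtype('uint64'), one SipHash value per element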

View File

@@ -0,0 +1,141 @@
from numpy cimport (
intp_t,
ndarray,
)
from pandas._libs.khash cimport (
complex64_t,
complex128_t,
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
kh_complex64_t,
kh_complex128_t,
kh_float32_t,
kh_float64_t,
kh_int8_t,
kh_int16_t,
kh_int32_t,
kh_int64_t,
kh_pymap_t,
kh_str_t,
kh_uint8_t,
kh_uint16_t,
kh_uint32_t,
kh_uint64_t,
khcomplex64_t,
khcomplex128_t,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)
# prototypes for sharing
cdef class HashTable:
pass
cdef class UInt64HashTable(HashTable):
cdef kh_uint64_t *table
cpdef get_item(self, uint64_t val)
cpdef set_item(self, uint64_t key, Py_ssize_t val)
cdef class Int64HashTable(HashTable):
cdef kh_int64_t *table
cpdef get_item(self, int64_t val)
cpdef set_item(self, int64_t key, Py_ssize_t val)
cdef class UInt32HashTable(HashTable):
cdef kh_uint32_t *table
cpdef get_item(self, uint32_t val)
cpdef set_item(self, uint32_t key, Py_ssize_t val)
cdef class Int32HashTable(HashTable):
cdef kh_int32_t *table
cpdef get_item(self, int32_t val)
cpdef set_item(self, int32_t key, Py_ssize_t val)
cdef class UInt16HashTable(HashTable):
cdef kh_uint16_t *table
cpdef get_item(self, uint16_t val)
cpdef set_item(self, uint16_t key, Py_ssize_t val)
cdef class Int16HashTable(HashTable):
cdef kh_int16_t *table
cpdef get_item(self, int16_t val)
cpdef set_item(self, int16_t key, Py_ssize_t val)
cdef class UInt8HashTable(HashTable):
cdef kh_uint8_t *table
cpdef get_item(self, uint8_t val)
cpdef set_item(self, uint8_t key, Py_ssize_t val)
cdef class Int8HashTable(HashTable):
cdef kh_int8_t *table
cpdef get_item(self, int8_t val)
cpdef set_item(self, int8_t key, Py_ssize_t val)
cdef class Float64HashTable(HashTable):
cdef kh_float64_t *table
cpdef get_item(self, float64_t val)
cpdef set_item(self, float64_t key, Py_ssize_t val)
cdef class Float32HashTable(HashTable):
cdef kh_float32_t *table
cpdef get_item(self, float32_t val)
cpdef set_item(self, float32_t key, Py_ssize_t val)
cdef class Complex64HashTable(HashTable):
cdef kh_complex64_t *table
cpdef get_item(self, complex64_t val)
cpdef set_item(self, complex64_t key, Py_ssize_t val)
cdef class Complex128HashTable(HashTable):
cdef kh_complex128_t *table
cpdef get_item(self, complex128_t val)
cpdef set_item(self, complex128_t key, Py_ssize_t val)
cdef class PyObjectHashTable(HashTable):
cdef kh_pymap_t *table
cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)
cdef class StringHashTable(HashTable):
cdef kh_str_t *table
cpdef get_item(self, str val)
cpdef set_item(self, str key, Py_ssize_t val)
cdef struct Int64VectorData:
int64_t *data
Py_ssize_t n, m
cdef class Vector:
cdef bint external_view_exists
cdef class Int64Vector(Vector):
cdef Int64VectorData *data
cdef ndarray ao
cdef resize(self)
cpdef ndarray to_array(self)
cdef inline void append(self, int64_t x)
cdef extend(self, int64_t[:] x)

View File

@@ -0,0 +1,213 @@
from typing import (
Hashable,
Literal,
)
import numpy as np
from pandas._typing import npt
def unique_label_indices(
labels: np.ndarray, # const int64_t[:]
) -> np.ndarray: ...
class Factorizer:
count: int
def __init__(self, size_hint: int): ...
def get_count(self) -> int: ...
class ObjectFactorizer(Factorizer):
table: PyObjectHashTable
uniques: ObjectVector
def factorize(
self,
values: npt.NDArray[np.object_],
sort: bool = ...,
na_sentinel=...,
na_value=...,
) -> npt.NDArray[np.intp]: ...
class Int64Factorizer(Factorizer):
table: Int64HashTable
uniques: Int64Vector
def factorize(
self,
values: np.ndarray, # const int64_t[:]
sort: bool = ...,
na_sentinel=...,
na_value=...,
) -> npt.NDArray[np.intp]: ...
class Int64Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.int64]: ...
class Int32Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.int32]: ...
class Int16Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.int16]: ...
class Int8Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.int8]: ...
class UInt64Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.uint64]: ...
class UInt32Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.uint32]: ...
class UInt16Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.uint16]: ...
class UInt8Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.uint8]: ...
class Float64Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.float64]: ...
class Float32Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.float32]: ...
class Complex128Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.complex128]: ...
class Complex64Vector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.complex64]: ...
class StringVector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.object_]: ...
class ObjectVector:
def __init__(self): ...
def __len__(self) -> int: ...
def to_array(self) -> npt.NDArray[np.object_]: ...
class HashTable:
# NB: The base HashTable class does _not_ actually have these methods;
# we are putting them here for the sake of mypy to avoid
# reproducing them in each subclass below.
def __init__(self, size_hint: int = ...): ...
def __len__(self) -> int: ...
def __contains__(self, key: Hashable) -> bool: ...
def sizeof(self, deep: bool = ...) -> int: ...
def get_state(self) -> dict[str, int]: ...
# TODO: `item` type is subclass-specific
def get_item(self, item): ... # TODO: return type?
def set_item(self, item) -> None: ...
# FIXME: we don't actually have this for StringHashTable or ObjectHashTable?
def map(
self,
keys: np.ndarray, # np.ndarray[subclass-specific]
values: np.ndarray, # const int64_t[:]
) -> None: ...
def map_locations(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
) -> None: ...
def lookup(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
) -> npt.NDArray[np.intp]: ...
def get_labels(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
uniques, # SubclassTypeVector
count_prior: int = ...,
na_sentinel: int = ...,
na_value: object = ...,
) -> npt.NDArray[np.intp]: ...
def unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
return_inverse: bool = ...,
) -> tuple[
np.ndarray, # np.ndarray[subclass-specific]
npt.NDArray[np.intp],
] | np.ndarray: ... # np.ndarray[subclass-specific]
def _unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
uniques, # FooVector
count_prior: int = ...,
na_sentinel: int = ...,
na_value: object = ...,
ignore_na: bool = ...,
return_inverse: bool = ...,
) -> tuple[
np.ndarray, # np.ndarray[subclass-specific]
npt.NDArray[np.intp],
] | np.ndarray: ... # np.ndarray[subclass-specific]
def factorize(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
na_sentinel: int = ...,
na_value: object = ...,
mask=...,
) -> tuple[np.ndarray, npt.NDArray[np.intp],]: ... # np.ndarray[subclass-specific]
class Complex128HashTable(HashTable): ...
class Complex64HashTable(HashTable): ...
class Float64HashTable(HashTable): ...
class Float32HashTable(HashTable): ...
class Int64HashTable(HashTable):
# Only Int64HashTable has get_labels_groupby
def get_labels_groupby(
self,
values: np.ndarray, # const int64_t[:]
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64],]: ...
class Int32HashTable(HashTable): ...
class Int16HashTable(HashTable): ...
class Int8HashTable(HashTable): ...
class UInt64HashTable(HashTable): ...
class UInt32HashTable(HashTable): ...
class UInt16HashTable(HashTable): ...
class UInt8HashTable(HashTable): ...
class StringHashTable(HashTable): ...
class PyObjectHashTable(HashTable): ...
class IntpHashTable(HashTable): ...
def duplicated(
values: np.ndarray,
keep: Literal["last", "first", False] = ...,
) -> npt.NDArray[np.bool_]: ...
def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ...
def value_count(
values: np.ndarray,
dropna: bool,
) -> tuple[np.ndarray, npt.NDArray[np.int64],]: ... # np.ndarray[same-as-values]
# arr and values should have same dtype
def ismember(
arr: np.ndarray,
values: np.ndarray,
) -> npt.NDArray[np.bool_]: ...
def object_hash(obj) -> int: ...
def objects_are_equal(a, b) -> bool: ...

View File

@@ -0,0 +1,182 @@
cimport cython
from cpython.mem cimport (
PyMem_Free,
PyMem_Malloc,
)
from cpython.ref cimport (
Py_INCREF,
PyObject,
)
from libc.stdlib cimport (
free,
malloc,
)
import numpy as np
cimport numpy as cnp
from numpy cimport (
float64_t,
ndarray,
uint8_t,
uint32_t,
)
from numpy.math cimport NAN
cnp.import_array()
from pandas._libs cimport util
from pandas._libs.khash cimport (
KHASH_TRACE_DOMAIN,
are_equivalent_float32_t,
are_equivalent_float64_t,
are_equivalent_khcomplex64_t,
are_equivalent_khcomplex128_t,
kh_needed_n_buckets,
kh_python_hash_equal,
kh_python_hash_func,
kh_str_t,
khcomplex64_t,
khcomplex128_t,
khiter_t,
)
from pandas._libs.missing cimport checknull
def get_hashtable_trace_domain():
return KHASH_TRACE_DOMAIN
def object_hash(obj):
return kh_python_hash_func(obj)
def objects_are_equal(a, b):
return kh_python_hash_equal(a, b)
cdef int64_t NPY_NAT = util.get_nat()
SIZE_HINT_LIMIT = (1 << 20) + 7
cdef Py_ssize_t _INIT_VEC_CAP = 128
include "hashtable_class_helper.pxi"
include "hashtable_func_helper.pxi"
# map derived hash-map types onto basic hash-map types:
if np.dtype(np.intp) == np.dtype(np.int64):
IntpHashTable = Int64HashTable
unique_label_indices = _unique_label_indices_int64
elif np.dtype(np.intp) == np.dtype(np.int32):
IntpHashTable = Int32HashTable
unique_label_indices = _unique_label_indices_int32
else:
raise ValueError(np.dtype(np.intp))
cdef class Factorizer:
cdef readonly:
Py_ssize_t count
def __cinit__(self, size_hint: int):
self.count = 0
def get_count(self) -> int:
return self.count
cdef class ObjectFactorizer(Factorizer):
cdef public:
PyObjectHashTable table
ObjectVector uniques
def __cinit__(self, size_hint: int):
self.table = PyObjectHashTable(size_hint)
self.uniques = ObjectVector()
def factorize(
self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
) -> np.ndarray:
"""
Returns
-------
np.ndarray[np.intp]
Examples
--------
Factorize values with nans replaced by na_sentinel
>>> fac = ObjectFactorizer(3)
>>> fac.factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
array([ 0, 1, 20])
"""
cdef:
ndarray[intp_t] labels
if self.uniques.external_view_exists:
uniques = ObjectVector()
uniques.extend(self.uniques.to_array())
self.uniques = uniques
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel, na_value)
mask = (labels == na_sentinel)
# remap labels to refer to sorted uniques, if requested
if sort:
sorter = self.uniques.to_array().argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))
labels = reverse_indexer.take(labels, mode='clip')
labels[mask] = na_sentinel
self.count = len(self.uniques)
return labels
cdef class Int64Factorizer(Factorizer):
cdef public:
Int64HashTable table
Int64Vector uniques
def __cinit__(self, size_hint: int):
self.table = Int64HashTable(size_hint)
self.uniques = Int64Vector()
def factorize(self, const int64_t[:] values, sort=False,
na_sentinel=-1, na_value=None) -> np.ndarray:
"""
Returns
-------
ndarray[intp_t]
Examples
--------
Factorize values with nans replaced by na_sentinel
>>> fac = Int64Factorizer(3)
>>> fac.factorize(np.array([1,2,3]), na_sentinel=20)
array([0, 1, 2])
"""
cdef:
ndarray[intp_t] labels
if self.uniques.external_view_exists:
uniques = Int64Vector()
uniques.extend(self.uniques.to_array())
self.uniques = uniques
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel,
na_value=na_value)
# remap labels to refer to sorted uniques, if requested
if sort:
sorter = self.uniques.to_array().argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))
labels = reverse_indexer.take(labels)
self.count = len(self.uniques)
return labels
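A sketch of the sort=True path above (illustrative): the returned labels are remapped so that they index into the uniques in sorted order.

# Illustrative use of the sort=True remapping in Int64Factorizer.factorize.
import numpy as np
from pandas._libs.hashtable import Int64Factorizer

fac = Int64Factorizer(3)
fac.factorize(np.array([3, 1, 2], dtype=np.int64), sort=True)
# -> array([2, 0, 1]) : labels now refer to the sorted uniques [1, 2, 3]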

File diff suppressed because it is too large.

View File

@@ -0,0 +1,515 @@
"""
Template for each `dtype` helper function for hashtable
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
{{py:
# name, dtype, ttype, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'complex128',
'khcomplex128_t', 'to_khcomplex128_t'),
('Complex64', 'complex64', 'complex64',
'khcomplex64_t', 'to_khcomplex64_t'),
('Float64', 'float64', 'float64', 'float64_t', ''),
('Float32', 'float32', 'float32', 'float32_t', ''),
('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
('Object', 'object', 'pymap', 'object', ''),
('Int64', 'int64', 'int64', 'int64_t', ''),
('Int32', 'int32', 'int32', 'int32_t', ''),
('Int16', 'int16', 'int16', 'int16_t', ''),
('Int8', 'int8', 'int8', 'int8_t', '')]
}}
{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
{{else}}
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
{{endif}}
cdef:
Py_ssize_t i = 0
Py_ssize_t n = len(values)
kh_{{ttype}}_t *table
# Don't use Py_ssize_t, since table.n_buckets is unsigned
khiter_t k
{{c_type}} val
int ret = 0
# we track the order in which keys are first seen (GH39009),
# khash-map isn't insertion-ordered, thus:
# table maps keys to counts
# result_keys remembers the original order of keys
result_keys = {{name}}Vector()
table = kh_init_{{ttype}}()
{{if dtype == 'object'}}
kh_resize_{{ttype}}(table, n // 10)
for i in range(n):
val = values[i]
if not dropna or not checknull(val):
k = kh_get_{{ttype}}(table, <PyObject*>val)
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
table.vals[k] = 1
result_keys.append(val)
{{else}}
kh_resize_{{ttype}}(table, n)
for i in range(n):
val = {{to_c_type}}(values[i])
if not is_nan_{{c_type}}(val) or not dropna:
k = kh_get_{{ttype}}(table, val)
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_{{ttype}}(table, val, &ret)
table.vals[k] = 1
result_keys.append(val)
{{endif}}
# collect counts in the order corresponding to result_keys:
cdef int64_t[:] result_counts = np.empty(table.size, dtype=np.int64)
for i in range(table.size):
{{if dtype == 'object'}}
k = kh_get_{{ttype}}(table, result_keys.data[i])
{{else}}
k = kh_get_{{ttype}}(table, result_keys.data.data[i])
{{endif}}
result_counts[i] = table.vals[k]
kh_destroy_{{ttype}}(table)
return result_keys.to_array(), result_counts.base
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
{{else}}
cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
{{endif}}
cdef:
int ret = 0
{{if dtype != 'object'}}
{{c_type}} value
{{endif}}
Py_ssize_t i, n = len(values)
khiter_t k
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
if keep not in ('last', 'first', False):
raise ValueError('keep must be either "first", "last" or False')
if keep == 'last':
{{if dtype == 'object'}}
for i in range(n - 1, -1, -1):
# equivalent: range(n)[::-1], which cython doesn't like in nogil
kh_put_{{ttype}}(table, <PyObject*>values[i], &ret)
out[i] = ret == 0
{{else}}
with nogil:
for i in range(n - 1, -1, -1):
# equivalent: range(n)[::-1], which cython doesn't like in nogil
value = {{to_c_type}}(values[i])
kh_put_{{ttype}}(table, value, &ret)
out[i] = ret == 0
{{endif}}
elif keep == 'first':
{{if dtype == 'object'}}
for i in range(n):
kh_put_{{ttype}}(table, <PyObject*>values[i], &ret)
out[i] = ret == 0
{{else}}
with nogil:
for i in range(n):
value = {{to_c_type}}(values[i])
kh_put_{{ttype}}(table, value, &ret)
out[i] = ret == 0
{{endif}}
else:
{{if dtype == 'object'}}
for i in range(n):
value = values[i]
k = kh_get_{{ttype}}(table, <PyObject*>value)
if k != table.n_buckets:
out[table.vals[k]] = 1
out[i] = 1
else:
k = kh_put_{{ttype}}(table, <PyObject*>value, &ret)
table.vals[k] = i
out[i] = 0
{{else}}
with nogil:
for i in range(n):
value = {{to_c_type}}(values[i])
k = kh_get_{{ttype}}(table, value)
if k != table.n_buckets:
out[table.vals[k]] = 1
out[i] = 1
else:
k = kh_put_{{ttype}}(table, value, &ret)
table.vals[k] = i
out[i] = 0
{{endif}}
kh_destroy_{{ttype}}(table)
return out
# ----------------------------------------------------------------------
# Membership
# ----------------------------------------------------------------------
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values):
{{else}}
cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
{{endif}}
"""
Return a boolean array indicating, element by element,
whether each value of arr is present in values.
Parameters
----------
arr : {{dtype}} ndarray
values : {{dtype}} ndarray
Returns
-------
boolean ndarray of len(arr)
"""
cdef:
Py_ssize_t i, n
khiter_t k
int ret = 0
ndarray[uint8_t] result
{{c_type}} val
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
# construct the table
n = len(values)
kh_resize_{{ttype}}(table, n)
{{if dtype == 'object'}}
for i in range(n):
kh_put_{{ttype}}(table, <PyObject*>values[i], &ret)
{{else}}
with nogil:
for i in range(n):
val = {{to_c_type}}(values[i])
kh_put_{{ttype}}(table, val, &ret)
{{endif}}
# test membership
n = len(arr)
result = np.empty(n, dtype=np.uint8)
{{if dtype == 'object'}}
for i in range(n):
val = arr[i]
k = kh_get_{{ttype}}(table, <PyObject*>val)
result[i] = (k != table.n_buckets)
{{else}}
with nogil:
for i in range(n):
val = {{to_c_type}}(arr[i])
k = kh_get_{{ttype}}(table, val)
result[i] = (k != table.n_buckets)
{{endif}}
kh_destroy_{{ttype}}(table)
return result.view(np.bool_)
# ----------------------------------------------------------------------
# Mode Computations
# ----------------------------------------------------------------------
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
{{else}}
cdef mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
{{endif}}
cdef:
{{if dtype == 'object'}}
ndarray[{{dtype}}] keys
ndarray[{{dtype}}] modes
{{else}}
{{dtype}}_t[:] keys
ndarray[{{dtype}}_t] modes
{{endif}}
int64_t[:] counts
int64_t count, max_count = -1
Py_ssize_t k, j = 0
keys, counts = value_count_{{dtype}}(values, dropna)
{{if dtype == 'object'}}
modes = np.empty(len(keys), dtype=np.object_)
{{else}}
modes = np.empty(len(keys), dtype=np.{{dtype}})
{{endif}}
{{if dtype != 'object'}}
with nogil:
for k in range(len(keys)):
count = counts[k]
if count == max_count:
j += 1
elif count > max_count:
max_count = count
j = 0
else:
continue
modes[j] = keys[k]
{{else}}
for k in range(len(keys)):
count = counts[k]
if count == max_count:
j += 1
elif count > max_count:
max_count = count
j = 0
else:
continue
modes[j] = keys[k]
{{endif}}
return modes[:j + 1]
{{endfor}}
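The ordering guarantee referenced above (GH39009) is the key property of the value_count_* helpers: keys come back in first-seen order, with counts aligned. A pure-Python sketch (illustrative only):

# Pure-Python sketch of the value_count_* ordering guarantee (illustrative).
def value_count_sketch(values):
    counts = {}                      # dicts preserve insertion order
    for v in values:
        counts[v] = counts.get(v, 0) + 1
    return list(counts), list(counts.values())

value_count_sketch([2, 1, 2, 3, 1, 2])
# -> ([2, 1, 3], [3, 2, 1])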
ctypedef fused htfunc_t:
complex128_t
complex64_t
float64_t
float32_t
uint64_t
uint32_t
uint16_t
uint8_t
int64_t
int32_t
int16_t
int8_t
object
cpdef value_count(ndarray[htfunc_t] values, bint dropna):
if htfunc_t is object:
return value_count_object(values, dropna)
elif htfunc_t is int8_t:
return value_count_int8(values, dropna)
elif htfunc_t is int16_t:
return value_count_int16(values, dropna)
elif htfunc_t is int32_t:
return value_count_int32(values, dropna)
elif htfunc_t is int64_t:
return value_count_int64(values, dropna)
elif htfunc_t is uint8_t:
return value_count_uint8(values, dropna)
elif htfunc_t is uint16_t:
return value_count_uint16(values, dropna)
elif htfunc_t is uint32_t:
return value_count_uint32(values, dropna)
elif htfunc_t is uint64_t:
return value_count_uint64(values, dropna)
elif htfunc_t is float64_t:
return value_count_float64(values, dropna)
elif htfunc_t is float32_t:
return value_count_float32(values, dropna)
elif htfunc_t is complex128_t:
return value_count_complex128(values, dropna)
elif htfunc_t is complex64_t:
return value_count_complex64(values, dropna)
else:
raise TypeError(values.dtype)
cpdef duplicated(ndarray[htfunc_t] values, object keep="first"):
if htfunc_t is object:
return duplicated_object(values, keep)
elif htfunc_t is int8_t:
return duplicated_int8(values, keep)
elif htfunc_t is int16_t:
return duplicated_int16(values, keep)
elif htfunc_t is int32_t:
return duplicated_int32(values, keep)
elif htfunc_t is int64_t:
return duplicated_int64(values, keep)
elif htfunc_t is uint8_t:
return duplicated_uint8(values, keep)
elif htfunc_t is uint16_t:
return duplicated_uint16(values, keep)
elif htfunc_t is uint32_t:
return duplicated_uint32(values, keep)
elif htfunc_t is uint64_t:
return duplicated_uint64(values, keep)
elif htfunc_t is float64_t:
return duplicated_float64(values, keep)
elif htfunc_t is float32_t:
return duplicated_float32(values, keep)
elif htfunc_t is complex128_t:
return duplicated_complex128(values, keep)
elif htfunc_t is complex64_t:
return duplicated_complex64(values, keep)
else:
raise TypeError(values.dtype)
cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values):
if htfunc_t is object:
return ismember_object(arr, values)
elif htfunc_t is int8_t:
return ismember_int8(arr, values)
elif htfunc_t is int16_t:
return ismember_int16(arr, values)
elif htfunc_t is int32_t:
return ismember_int32(arr, values)
elif htfunc_t is int64_t:
return ismember_int64(arr, values)
elif htfunc_t is uint8_t:
return ismember_uint8(arr, values)
elif htfunc_t is uint16_t:
return ismember_uint16(arr, values)
elif htfunc_t is uint32_t:
return ismember_uint32(arr, values)
elif htfunc_t is uint64_t:
return ismember_uint64(arr, values)
elif htfunc_t is float64_t:
return ismember_float64(arr, values)
elif htfunc_t is float32_t:
return ismember_float32(arr, values)
elif htfunc_t is complex128_t:
return ismember_complex128(arr, values)
elif htfunc_t is complex64_t:
return ismember_complex64(arr, values)
else:
raise TypeError(values.dtype)
cpdef mode(ndarray[htfunc_t] values, bint dropna):
if htfunc_t is object:
return mode_object(values, dropna)
elif htfunc_t is int8_t:
return mode_int8(values, dropna)
elif htfunc_t is int16_t:
return mode_int16(values, dropna)
elif htfunc_t is int32_t:
return mode_int32(values, dropna)
elif htfunc_t is int64_t:
return mode_int64(values, dropna)
elif htfunc_t is uint8_t:
return mode_uint8(values, dropna)
elif htfunc_t is uint16_t:
return mode_uint16(values, dropna)
elif htfunc_t is uint32_t:
return mode_uint32(values, dropna)
elif htfunc_t is uint64_t:
return mode_uint64(values, dropna)
elif htfunc_t is float64_t:
return mode_float64(values, dropna)
elif htfunc_t is float32_t:
return mode_float32(values, dropna)
elif htfunc_t is complex128_t:
return mode_complex128(values, dropna)
elif htfunc_t is complex64_t:
return mode_complex64(values, dropna)
else:
raise TypeError(values.dtype)
{{py:
# name, dtype, ttype, c_type
dtypes = [('Int64', 'int64', 'int64', 'int64_t'),
('Int32', 'int32', 'int32', 'int32_t'), ]
}}
{{for name, dtype, ttype, c_type in dtypes}}
@cython.wraparound(False)
@cython.boundscheck(False)
def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
"""
Indices of the first occurrences of the unique labels
*excluding* -1; equivalent to:
np.unique(labels, return_index=True)[1]
"""
cdef:
int ret = 0
Py_ssize_t i, n = len(labels)
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
{{name}}Vector idx = {{name}}Vector()
ndarray[{{c_type}}, ndim=1] arr
{{name}}VectorData *ud = idx.data
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
with nogil:
for i in range(n):
kh_put_{{ttype}}(table, labels[i], &ret)
if ret != 0:
if needs_resize(ud):
with gil:
idx.resize()
append_data_{{ttype}}(ud, i)
kh_destroy_{{ttype}}(table)
arr = idx.to_array()
arr = arr[np.asarray(labels)[arr].argsort()]
return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
{{endfor}}
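The docstring's equivalence claim can be checked directly with NumPy (illustrative only):

# NumPy equivalence sketch for _unique_label_indices_* (illustrative only).
import numpy as np

labels = np.array([3, -1, 3, 0, 0, 2], dtype=np.int64)
uniq, first_idx = np.unique(labels, return_index=True)
first_idx[uniq != -1]
# -> array([3, 5, 0]) : first occurrence of 0, 2 and 3, with -1 dropped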

View File

@@ -0,0 +1,65 @@
import numpy as np
from pandas._typing import npt
from pandas import MultiIndex
class IndexEngine:
over_size_threshold: bool
def __init__(self, values: np.ndarray): ...
def __contains__(self, val: object) -> bool: ...
# -> int | slice | np.ndarray[bool]
def get_loc(self, val: object) -> int | slice | np.ndarray: ...
def sizeof(self, deep: bool = ...) -> int: ...
def __sizeof__(self) -> int: ...
@property
def is_unique(self) -> bool: ...
@property
def is_monotonic_increasing(self) -> bool: ...
@property
def is_monotonic_decreasing(self) -> bool: ...
@property
def is_mapping_populated(self) -> bool: ...
def clear_mapping(self): ...
def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
def get_indexer_non_unique(
self,
targets: np.ndarray,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
class Float64Engine(IndexEngine): ...
class Float32Engine(IndexEngine): ...
class Int64Engine(IndexEngine): ...
class Int32Engine(IndexEngine): ...
class Int16Engine(IndexEngine): ...
class Int8Engine(IndexEngine): ...
class UInt64Engine(IndexEngine): ...
class UInt32Engine(IndexEngine): ...
class UInt16Engine(IndexEngine): ...
class UInt8Engine(IndexEngine): ...
class ObjectEngine(IndexEngine): ...
class DatetimeEngine(Int64Engine): ...
class TimedeltaEngine(DatetimeEngine): ...
class PeriodEngine(Int64Engine): ...
class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
offsets: np.ndarray # ndarray[uint64_t, ndim=1]
def __init__(
self,
levels: list[np.ndarray], # all entries hashable
labels: list[np.ndarray], # all entries integer-dtyped
offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1]
): ...
def get_indexer(
self,
target: npt.NDArray[np.object_],
) -> npt.NDArray[np.intp]: ...
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
def get_indexer_with_fill(
self,
target: np.ndarray, # np.ndarray[object] of tuples
values: np.ndarray, # np.ndarray[object] of tuples
method: str,
limit: int | None,
) -> npt.NDArray[np.intp]: ...

View File

@@ -0,0 +1,799 @@
cimport cython
import numpy as np
cimport numpy as cnp
from numpy cimport (
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
intp_t,
ndarray,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)
cnp.import_array()
from pandas._libs cimport util
from pandas._libs.hashtable cimport HashTable
from pandas._libs.tslibs.nattype cimport c_NaT as NaT
from pandas._libs.tslibs.period cimport is_period_object
from pandas._libs.tslibs.timedeltas cimport _Timedelta
from pandas._libs.tslibs.timestamps cimport _Timestamp
from pandas._libs import (
algos,
hashtable as _hash,
)
from pandas._libs.lib cimport eq_NA_compat
from pandas._libs.missing cimport (
C_NA as NA,
checknull,
is_matching_na,
)
cdef inline bint is_definitely_invalid_key(object val):
try:
hash(val)
except TypeError:
return True
return False
cdef ndarray _get_bool_indexer(ndarray values, object val):
"""
Return a ndarray[bool] of locations where val matches self.values.
If val is not NA, this is equivalent to `self.values == val`
"""
# Caller is responsible for ensuring _check_type has already been called
cdef:
ndarray[uint8_t, ndim=1, cast=True] indexer
Py_ssize_t i
object item
if values.descr.type_num == cnp.NPY_OBJECT:
# i.e. values.dtype == object
if not checknull(val):
indexer = eq_NA_compat(values, val)
else:
# We need to check for _matching_ NA values
indexer = np.empty(len(values), dtype=np.uint8)
for i in range(len(values)):
item = values[i]
indexer[i] = is_matching_na(item, val)
else:
if util.is_nan(val):
indexer = np.isnan(values)
else:
indexer = values == val
return indexer.view(bool)
# Don't populate hash tables in monotonic indexes larger than this
_SIZE_CUTOFF = 1_000_000
cdef _unpack_bool_indexer(ndarray[uint8_t, ndim=1, cast=True] indexer, object val):
"""
Possibly unpack a boolean mask to a single indexer.
"""
# Returns ndarray[bool] or int
cdef:
ndarray[intp_t, ndim=1] found
int count
found = np.where(indexer)[0]
count = len(found)
if count > 1:
return indexer
if count == 1:
return int(found[0])
raise KeyError(val)
@cython.freelist(32)
cdef class IndexEngine:
cdef readonly:
ndarray values
HashTable mapping
bint over_size_threshold
cdef:
bint unique, monotonic_inc, monotonic_dec
bint need_monotonic_check, need_unique_check
object _np_type
def __init__(self, ndarray values):
self.values = values
self.over_size_threshold = len(values) >= _SIZE_CUTOFF
self.clear_mapping()
self._np_type = values.dtype.type
def __contains__(self, val: object) -> bool:
# We assume before we get here:
# - val is hashable
self._ensure_mapping_populated()
return val in self.mapping
cpdef get_loc(self, object val):
# -> Py_ssize_t | slice | ndarray[bool]
cdef:
Py_ssize_t loc
if is_definitely_invalid_key(val):
raise TypeError(f"'{val}' is an invalid key")
self._check_type(val)
if self.over_size_threshold and self.is_monotonic_increasing:
if not self.is_unique:
return self._get_loc_duplicates(val)
values = self.values
loc = self._searchsorted_left(val)
if loc >= len(values):
raise KeyError(val)
if values[loc] != val:
raise KeyError(val)
return loc
self._ensure_mapping_populated()
if not self.unique:
return self._get_loc_duplicates(val)
try:
return self.mapping.get_item(val)
except OverflowError as err:
# GH#41775 OverflowError e.g. if we are uint64 and val is -1
# or if we are int64 and value is np.iinfo(np.int64).max+1
# (the uint64 with -1 case should actually be excluded by _check_type)
raise KeyError(val) from err
cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
"""
See ObjectEngine._searchsorted_left.__doc__.
"""
# Caller is responsible for ensuring _check_type has already been called
loc = self.values.searchsorted(self._np_type(val), side="left")
return loc
cdef inline _get_loc_duplicates(self, object val):
# -> Py_ssize_t | slice | ndarray[bool]
cdef:
Py_ssize_t diff, left, right
if self.is_monotonic_increasing:
values = self.values
try:
left = values.searchsorted(val, side='left')
right = values.searchsorted(val, side='right')
except TypeError:
# e.g. GH#29189 get_loc(None) with a Float64Index
# 2021-09-29 Now only reached for object-dtype
raise KeyError(val)
diff = right - left
if diff == 0:
raise KeyError(val)
elif diff == 1:
return left
else:
return slice(left, right)
return self._maybe_get_bool_indexer(val)
cdef _maybe_get_bool_indexer(self, object val):
# Returns ndarray[bool] or int
cdef:
ndarray[uint8_t, ndim=1, cast=True] indexer
indexer = _get_bool_indexer(self.values, val)
return _unpack_bool_indexer(indexer, val)
def sizeof(self, deep: bool = False) -> int:
""" return the sizeof our mapping """
if not self.is_mapping_populated:
return 0
return self.mapping.sizeof(deep=deep)
def __sizeof__(self) -> int:
return self.sizeof()
@property
def is_unique(self) -> bool:
if self.need_unique_check:
self._do_unique_check()
return self.unique == 1
cdef inline _do_unique_check(self):
# this is de facto the same as populating the mapping (which also sets `unique`)
self._ensure_mapping_populated()
@property
def is_monotonic_increasing(self) -> bool:
if self.need_monotonic_check:
self._do_monotonic_check()
return self.monotonic_inc == 1
@property
def is_monotonic_decreasing(self) -> bool:
if self.need_monotonic_check:
self._do_monotonic_check()
return self.monotonic_dec == 1
cdef inline _do_monotonic_check(self):
cdef:
bint is_unique
try:
values = self.values
self.monotonic_inc, self.monotonic_dec, is_unique = \
self._call_monotonic(values)
except TypeError:
self.monotonic_inc = 0
self.monotonic_dec = 0
is_unique = 0
self.need_monotonic_check = 0
# we can only be sure of uniqueness if is_unique=1
if is_unique:
self.unique = 1
self.need_unique_check = 0
cdef _call_monotonic(self, values):
return algos.is_monotonic(values, timelike=False)
cdef _make_hash_table(self, Py_ssize_t n):
raise NotImplementedError # pragma: no cover
cdef _check_type(self, object val):
hash(val)
@property
def is_mapping_populated(self) -> bool:
return self.mapping is not None
cdef inline _ensure_mapping_populated(self):
# this populates the mapping
# if its not already populated
# also satisfies the need_unique_check
if not self.is_mapping_populated:
values = self.values
self.mapping = self._make_hash_table(len(values))
self.mapping.map_locations(values)
if len(self.mapping) == len(values):
self.unique = 1
self.need_unique_check = 0
def clear_mapping(self):
self.mapping = None
self.need_monotonic_check = 1
self.need_unique_check = 1
self.unique = 0
self.monotonic_inc = 0
self.monotonic_dec = 0
def get_indexer(self, ndarray values) -> np.ndarray:
self._ensure_mapping_populated()
return self.mapping.lookup(values)
def get_indexer_non_unique(self, ndarray targets):
"""
Return an indexer suitable for taking from a non unique index
return the labels in the same order as the target
and a missing indexer into the targets (which corresponds
to the -1 indices in the results).
Returns
-------
indexer : np.ndarray[np.intp]
missing : np.ndarray[np.intp]
"""
cdef:
ndarray values
ndarray[intp_t] result, missing
set stargets, remaining_stargets, found_nas
dict d = {}
object val
Py_ssize_t count = 0, count_missing = 0
Py_ssize_t i, j, n, n_t, n_alloc, start, end
bint check_na_values = False
values = self.values
stargets = set(targets)
n = len(values)
n_t = len(targets)
if n > 10_000:
n_alloc = 10_000
else:
n_alloc = n
result = np.empty(n_alloc, dtype=np.intp)
missing = np.empty(n_t, dtype=np.intp)
# map each starget to its position in the index
if (
stargets and
len(stargets) < 5 and
not any([checknull(t) for t in stargets]) and
self.is_monotonic_increasing
):
# if there are few enough stargets and the index is monotonically
# increasing, then use binary search for each starget
remaining_stargets = set()
for starget in stargets:
try:
start = values.searchsorted(starget, side='left')
end = values.searchsorted(starget, side='right')
except TypeError: # e.g. if we tried to search for string in int array
remaining_stargets.add(starget)
else:
if start != end:
d[starget] = list(range(start, end))
stargets = remaining_stargets
if stargets:
# otherwise, map by iterating through all items in the index
# short-circuit na check
if values.dtype == object:
check_na_values = True
# keep track of nas in values
found_nas = set()
for i in range(n):
val = values[i]
# GH#43870
# handle lookup for nas
# (ie. np.nan, float("NaN"), Decimal("NaN"), dt64nat, td64nat)
if check_na_values and checknull(val):
match = [na for na in found_nas if is_matching_na(val, na)]
# matching na not found
if not len(match):
found_nas.add(val)
# add na to stargets to utilize `in` for stargets/d lookup
match_stargets = [
x for x in stargets if is_matching_na(val, x)
]
if len(match_stargets):
# add our 'standardized' na
stargets.add(val)
# matching na found
else:
assert len(match) == 1
val = match[0]
if val in stargets:
if val not in d:
d[val] = []
d[val].append(i)
for i in range(n_t):
val = targets[i]
# ensure there are nas in values before looking for a matching na
if check_na_values and checknull(val):
match = [na for na in found_nas if is_matching_na(val, na)]
if len(match):
assert len(match) == 1
val = match[0]
# found
if val in d:
key = val
for j in d[key]:
# realloc if needed
if count >= n_alloc:
n_alloc += 10_000
result = np.resize(result, n_alloc)
result[count] = j
count += 1
# value not found
else:
if count >= n_alloc:
n_alloc += 10_000
result = np.resize(result, n_alloc)
result[count] = -1
count += 1
missing[count_missing] = i
count_missing += 1
return result[0:count], missing[0:count_missing]
cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
# GH#1757 ndarray.searchsorted is not safe to use with array of tuples
# (treats a tuple `val` as a sequence of keys instead of a single key),
# so we implement something similar.
# This is equivalent to the stdlib's bisect.bisect_left
cdef:
Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1
object pval
if hi == 0 or (hi > 0 and val > values[hi]):
return len(values)
while lo < hi:
mid = (lo + hi) // 2
pval = values[mid]
if val < pval:
hi = mid
elif val > pval:
lo = mid + 1
else:
while mid > 0 and val == values[mid - 1]:
mid -= 1
return mid
if val <= values[mid]:
return mid
else:
return mid + 1
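# Note (illustrative): the stdlib analogy above holds for plain Python sequences:
#   import bisect
#   bisect.bisect_left([(1, "a"), (1, "b"), (2, "a")], (1, "b"))  # -> 1
# ndarray.searchsorted, by contrast, would treat the tuple as a sequence of keys.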
cdef class ObjectEngine(IndexEngine):
"""
Index Engine for use with object-dtype Index, namely the base class Index.
"""
cdef _make_hash_table(self, Py_ssize_t n):
return _hash.PyObjectHashTable(n)
cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
# using values.searchsorted here would treat a tuple `val` as a sequence
# instead of a single key, so we use a different implementation
try:
loc = _bin_search(self.values, val)
except TypeError as err:
raise KeyError(val) from err
return loc
cdef class DatetimeEngine(Int64Engine):
cdef int64_t _unbox_scalar(self, scalar) except? -1:
# NB: caller is responsible for ensuring tzawareness compat
# before we get here
if not (isinstance(scalar, _Timestamp) or scalar is NaT):
raise TypeError(scalar)
return scalar.value
def __contains__(self, val: object) -> bool:
# We assume before we get here:
# - val is hashable
self._unbox_scalar(val)
try:
self.get_loc(val)
return True
except KeyError:
return False
cdef _call_monotonic(self, values):
return algos.is_monotonic(values, timelike=True)
cpdef get_loc(self, object val):
# NB: the caller is responsible for ensuring that we are called
# with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine)
cdef:
Py_ssize_t loc
if is_definitely_invalid_key(val):
raise TypeError(f"'{val}' is an invalid key")
try:
conv = self._unbox_scalar(val)
except TypeError:
raise KeyError(val)
# Welcome to the spaghetti factory
if self.over_size_threshold and self.is_monotonic_increasing:
if not self.is_unique:
return self._get_loc_duplicates(conv)
values = self.values
loc = values.searchsorted(conv, side='left')
if loc == len(values) or values[loc] != conv:
raise KeyError(val)
return loc
self._ensure_mapping_populated()
if not self.unique:
return self._get_loc_duplicates(conv)
try:
return self.mapping.get_item(conv)
except KeyError:
raise KeyError(val)
cdef class TimedeltaEngine(DatetimeEngine):
cdef int64_t _unbox_scalar(self, scalar) except? -1:
if not (isinstance(scalar, _Timedelta) or scalar is NaT):
raise TypeError(scalar)
return scalar.value
cdef class PeriodEngine(Int64Engine):
cdef int64_t _unbox_scalar(self, scalar) except? -1:
if scalar is NaT:
return scalar.value
if is_period_object(scalar):
# NB: we assume that we have the correct freq here.
return scalar.ordinal
raise TypeError(scalar)
cpdef get_loc(self, object val):
# NB: the caller is responsible for ensuring that we are called
# with either a Period or NaT
cdef:
int64_t conv
try:
conv = self._unbox_scalar(val)
except TypeError:
raise KeyError(val)
return Int64Engine.get_loc(self, conv)
cdef _call_monotonic(self, values):
return algos.is_monotonic(values, timelike=True)
cdef class BaseMultiIndexCodesEngine:
"""
Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
represent each label in a MultiIndex as an integer, by juxtaposing the bits
encoding each level, with appropriate offsets.
For instance: if 3 levels have respectively 3, 6 and 1 possible values,
then their labels can be represented using respectively 2, 3 and 1 bits,
as follows:
_ _ _ _____ _ __ __ __
|0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
— — — ————— — —— —— ——
|0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
— — — ————— — —— —— ——
|0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
and the resulting unsigned integer representation will be:
_ _ _ _____ _ __ __ __ __ __ __
|0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾
Offsets are calculated at initialization, labels are transformed by method
_codes_to_ints.
Keys are located by first locating each component against the respective
level, then locating (the integer representation of) codes.
"""
def __init__(self, object levels, object labels,
ndarray[uint64_t, ndim=1] offsets):
"""
Parameters
----------
levels : list-like of numpy arrays
Levels of the MultiIndex.
labels : list-like of numpy arrays of integer dtype
Labels of the MultiIndex.
offsets : numpy array of uint64 dtype
Pre-calculated offsets, one for each level of the index.
"""
self.levels = levels
self.offsets = offsets
# Transform labels into a single array, and add 1 so that we are working
# with positive integers (-1 for NaN becomes 0):
codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
copy=False)
# Map each codes combination in the index to an integer unambiguously
# (no collisions possible), based on the "offsets", which describe the
# number of bits by which to shift the labels of each level:
lab_ints = self._codes_to_ints(codes)
# Initialize underlying index (e.g. libindex.UInt64Engine) with
# integers representing labels: we will use its get_loc and get_indexer
self._base.__init__(self, lab_ints)
def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
raise NotImplementedError("Implemented by subclass") # pragma: no cover
def _extract_level_codes(self, target) -> np.ndarray:
"""
Map the requested list of (tuple) keys to their integer representations
for searching in the underlying integer index.
Parameters
----------
target : MultiIndex
Returns
-------
int_keys : 1-dimensional array of dtype uint64 or object
Integers representing one combination each
"""
zt = [target._get_level_values(i) for i in range(target.nlevels)]
level_codes = [lev.get_indexer_for(codes) + 1 for lev, codes
in zip(self.levels, zt)]
return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
def get_indexer(self, target: np.ndarray) -> np.ndarray:
"""
Returns an array giving the positions of each value of `target` in
`self.values`, where -1 represents a value in `target` which does not
appear in `self.values`
Parameters
----------
target : np.ndarray
Returns
-------
np.ndarray[intp_t, ndim=1] of the indexer of `target` into
`self.values`
"""
return self._base.get_indexer(self, target)
def get_indexer_with_fill(self, ndarray target, ndarray values,
str method, object limit) -> np.ndarray:
"""
Returns an array giving the positions of each value of `target` in
`values`, where -1 represents a value in `target` which does not
appear in `values`
If `method` is "backfill" then the position for a value in `target`
which does not appear in `values` is that of the next greater value
in `values` (if one exists), and -1 if there is no such value.
Similarly, if the method is "pad" then the position for a value in
`target` which does not appear in `values` is that of the next smaller
value in `values` (if one exists), and -1 if there is no such value.
Parameters
----------
target: ndarray[object] of tuples
need not be sorted, but all must have the same length, which must be
the same as the length of all tuples in `values`
values : ndarray[object] of tuples
must be sorted and all have the same length. Should be the set of
the MultiIndex's values.
method: string
"backfill" or "pad"
limit: int or None
if provided, limit the number of fills to this value
Returns
-------
np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
filled with the `method` (and optionally `limit`) specified
"""
assert method in ("backfill", "pad")
cdef:
int64_t i, j, next_code
int64_t num_values, num_target_values
ndarray[int64_t, ndim=1] target_order
ndarray[object, ndim=1] target_values
ndarray[int64_t, ndim=1] new_codes, new_target_codes
ndarray[intp_t, ndim=1] sorted_indexer
target_order = np.argsort(target).astype('int64')
target_values = target[target_order]
num_values, num_target_values = len(values), len(target_values)
new_codes, new_target_codes = (
np.empty((num_values,)).astype('int64'),
np.empty((num_target_values,)).astype('int64'),
)
# `values` and `target_values` are both sorted, so we walk through them
# and memoize the (ordered) set of indices in the (implicit) merged-and
# sorted list of the two which belong to each of them
# the effect of this is to create a factorization for the (sorted)
# merger of the index values, where `new_codes` and `new_target_codes`
# are the subset of the factors which appear in `values` and `target`,
# respectively
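# Hedged worked example of the factorization (values invented for this sketch):
# with values = [(1,), (3,), (5,)] and sorted target_values = [(2,), (3,)],
# the merge below produces new_codes = [0, 2, 3] and new_target_codes = [1, 2];
# algos.pad then maps the two targets to positions [0, 1] of `values`.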
i, j, next_code = 0, 0, 0
while i < num_values and j < num_target_values:
val, target_val = values[i], target_values[j]
if val <= target_val:
new_codes[i] = next_code
i += 1
if target_val <= val:
new_target_codes[j] = next_code
j += 1
next_code += 1
# at this point, at least one should have reached the end
# the remaining values of the other should be added to the end
assert i == num_values or j == num_target_values
while i < num_values:
new_codes[i] = next_code
i += 1
next_code += 1
while j < num_target_values:
new_target_codes[j] = next_code
j += 1
next_code += 1
# get the indexer, and undo the sorting of `target.values`
algo = algos.backfill if method == "backfill" else algos.pad
sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
return sorted_indexer[np.argsort(target_order)]
def get_loc(self, object key):
if is_definitely_invalid_key(key):
raise TypeError(f"'{key}' is an invalid key")
if not isinstance(key, tuple):
raise KeyError(key)
try:
indices = [0 if checknull(v) else lev.get_loc(v) + 1
for lev, v in zip(self.levels, key)]
except KeyError:
raise KeyError(key)
# Transform indices into single integer:
lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))
return self._base.get_loc(self, lab_int)
def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray:
indexer = self._base.get_indexer_non_unique(self, target)
return indexer
def __contains__(self, val: object) -> bool:
# We assume before we get here:
# - val is hashable
# Default __contains__ looks in the underlying mapping, which in this
# case only contains integer representations.
try:
self.get_loc(val)
return True
except (KeyError, TypeError, ValueError):
return False
# Generated from template.
include "index_class_helper.pxi"

View File

@@ -0,0 +1,51 @@
"""
Template for functions of IndexEngine subclasses.
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
# ----------------------------------------------------------------------
# IndexEngine Subclass Methods
# ----------------------------------------------------------------------
{{py:
# name, dtype
dtypes = [('Float64', 'float64'),
('Float32', 'float32'),
('Int64', 'int64'),
('Int32', 'int32'),
('Int16', 'int16'),
('Int8', 'int8'),
('UInt64', 'uint64'),
('UInt32', 'uint32'),
('UInt16', 'uint16'),
('UInt8', 'uint8'),
]
}}
{{for name, dtype in dtypes}}
cdef class {{name}}Engine(IndexEngine):
cdef _make_hash_table(self, Py_ssize_t n):
return _hash.{{name}}HashTable(n)
cdef _check_type(self, object val):
{{if name not in {'Float64', 'Float32'} }}
if not util.is_integer_object(val):
raise KeyError(val)
{{if name.startswith("U")}}
if val < 0:
# cannot have negative values with unsigned int dtype
raise KeyError(val)
{{endif}}
{{else}}
if not util.is_integer_object(val) and not util.is_float_object(val):
# in particular catch bool and avoid casting True -> 1.0
raise KeyError(val)
{{endif}}
{{endfor}}
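# A hedged sketch of what the template above expands to for one dtype (Int64,
# abridged):
#
#   cdef class Int64Engine(IndexEngine):
#       cdef _make_hash_table(self, Py_ssize_t n):
#           return _hash.Int64HashTable(n)
#       cdef _check_type(self, object val):
#           if not util.is_integer_object(val):
#               raise KeyError(val)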

View File

@@ -0,0 +1,25 @@
cdef class NDFrameIndexerBase:
"""
A base class for _NDFrameIndexer for fast instantiation and attribute access.
"""
cdef public:
str name
object obj, _ndim
def __init__(self, name: str, obj):
self.obj = obj
self.name = name
self._ndim = None
@property
def ndim(self) -> int:
# Delay `ndim` instantiation until required as reading it
# from `obj` isn't entirely cheap.
ndim = self._ndim
if ndim is None:
ndim = self._ndim = self.obj.ndim
if ndim > 2:
raise ValueError( # pragma: no cover
"NDFrameIndexer does not support NDFrame objects with ndim > 2"
)
return ndim

View File

@@ -0,0 +1,85 @@
from typing import (
Iterator,
Sequence,
final,
overload,
)
import numpy as np
from pandas._typing import (
ArrayLike,
T,
npt,
)
from pandas import Index
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.internals.blocks import Block as B
def slice_len(slc: slice, objlen: int = ...) -> int: ...
def get_blkno_indexers(
blknos: np.ndarray, # int64_t[:]
group: bool = ...,
) -> list[tuple[int, slice | np.ndarray]]: ...
def get_blkno_placements(
blknos: np.ndarray,
group: bool = ...,
) -> Iterator[tuple[int, BlockPlacement]]: ...
def update_blklocs_and_blknos(
blklocs: npt.NDArray[np.intp],
blknos: npt.NDArray[np.intp],
loc: int,
nblocks: int,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
class BlockPlacement:
def __init__(self, val: int | slice | np.ndarray): ...
@property
def indexer(self) -> np.ndarray | slice: ...
@property
def as_array(self) -> np.ndarray: ...
@property
def as_slice(self) -> slice: ...
@property
def is_slice_like(self) -> bool: ...
@overload
def __getitem__(self, loc: slice | Sequence[int]) -> BlockPlacement: ...
@overload
def __getitem__(self, loc: int) -> int: ...
def __iter__(self) -> Iterator[int]: ...
def __len__(self) -> int: ...
def delete(self, loc) -> BlockPlacement: ...
def append(self, others: list[BlockPlacement]) -> BlockPlacement: ...
def tile_for_unstack(self, factor: int) -> npt.NDArray[np.intp]: ...
class SharedBlock:
_mgr_locs: BlockPlacement
ndim: int
values: ArrayLike
def __init__(self, values: ArrayLike, placement: BlockPlacement, ndim: int): ...
class NumpyBlock(SharedBlock):
values: np.ndarray
@final
def getitem_block_index(self: T, slicer: slice) -> T: ...
class NDArrayBackedBlock(SharedBlock):
values: NDArrayBackedExtensionArray
@final
def getitem_block_index(self: T, slicer: slice) -> T: ...
class Block(SharedBlock): ...
class BlockManager:
blocks: tuple[B, ...]
axes: list[Index]
_known_consolidated: bool
_is_consolidated: bool
_blknos: np.ndarray
_blklocs: np.ndarray
def __init__(
self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=...
): ...
def get_slice(self: T, slobj: slice, axis: int = ...) -> T: ...
def _rebuild_blknos_and_blklocs(self) -> None: ...

View File

@@ -0,0 +1,824 @@
from collections import defaultdict
import cython
from cython import Py_ssize_t
from cpython.slice cimport PySlice_GetIndicesEx
cdef extern from "Python.h":
Py_ssize_t PY_SSIZE_T_MAX
import numpy as np
cimport numpy as cnp
from numpy cimport (
NPY_INTP,
int64_t,
intp_t,
ndarray,
)
cnp.import_array()
from pandas._libs.algos import ensure_int64
from pandas._libs.arrays cimport NDArrayBacked
from pandas._libs.util cimport (
is_array,
is_integer_object,
)
@cython.final
@cython.freelist(32)
cdef class BlockPlacement:
# __slots__ = '_as_slice', '_as_array', '_len'
cdef:
slice _as_slice
ndarray _as_array # Note: this still allows `None`; will be intp_t
bint _has_slice, _has_array, _is_known_slice_like
def __cinit__(self, val):
cdef:
slice slc
self._as_slice = None
self._as_array = None
self._has_slice = False
self._has_array = False
if is_integer_object(val):
slc = slice(val, val + 1, 1)
self._as_slice = slc
self._has_slice = True
elif isinstance(val, slice):
slc = slice_canonize(val)
if slc.start != slc.stop:
self._as_slice = slc
self._has_slice = True
else:
arr = np.empty(0, dtype=np.intp)
self._as_array = arr
self._has_array = True
else:
# Cython memoryview interface requires ndarray to be writeable.
if (
not is_array(val)
or not cnp.PyArray_ISWRITEABLE(val)
or (<ndarray>val).descr.type_num != cnp.NPY_INTP
):
arr = np.require(val, dtype=np.intp, requirements='W')
else:
arr = val
# Caller is responsible for ensuring arr.ndim == 1
self._as_array = arr
self._has_array = True
def __str__(self) -> str:
cdef:
slice s = self._ensure_has_slice()
if s is not None:
v = self._as_slice
else:
v = self._as_array
return f"{type(self).__name__}({v})"
def __repr__(self) -> str:
return str(self)
def __len__(self) -> int:
cdef:
slice s = self._ensure_has_slice()
if s is not None:
return slice_len(s)
else:
return len(self._as_array)
def __iter__(self):
cdef:
slice s = self._ensure_has_slice()
Py_ssize_t start, stop, step, _
if s is not None:
start, stop, step, _ = slice_get_indices_ex(s)
return iter(range(start, stop, step))
else:
return iter(self._as_array)
@property
def as_slice(self) -> slice:
cdef:
slice s = self._ensure_has_slice()
if s is not None:
return s
else:
raise TypeError("Not slice-like")
@property
def indexer(self):
cdef:
slice s = self._ensure_has_slice()
if s is not None:
return s
else:
return self._as_array
@property
def as_array(self) -> np.ndarray:
cdef:
Py_ssize_t start, stop, step, _
if not self._has_array:
start, stop, step, _ = slice_get_indices_ex(self._as_slice)
# NOTE: this is the C-optimized equivalent of
# `np.arange(start, stop, step, dtype=np.intp)`
self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INTP)
self._has_array = True
return self._as_array
@property
def is_slice_like(self) -> bool:
cdef:
slice s = self._ensure_has_slice()
return s is not None
def __getitem__(self, loc):
cdef:
slice s = self._ensure_has_slice()
if s is not None:
val = slice_getitem(s, loc)
else:
val = self._as_array[loc]
if not isinstance(val, slice) and val.ndim == 0:
return val
return BlockPlacement(val)
def delete(self, loc) -> BlockPlacement:
return BlockPlacement(np.delete(self.as_array, loc, axis=0))
def append(self, others) -> BlockPlacement:
if not len(others):
return self
return BlockPlacement(
np.concatenate([self.as_array] + [o.as_array for o in others])
)
cdef BlockPlacement iadd(self, other):
cdef:
slice s = self._ensure_has_slice()
Py_ssize_t other_int, start, stop, step
if is_integer_object(other) and s is not None:
other_int = <Py_ssize_t>other
if other_int == 0:
# BlockPlacement is treated as immutable
return self
start, stop, step, _ = slice_get_indices_ex(s)
start += other_int
stop += other_int
if (step > 0 and start < 0) or (step < 0 and stop < step):
raise ValueError("iadd causes length change")
if stop < 0:
val = slice(start, None, step)
else:
val = slice(start, stop, step)
return BlockPlacement(val)
else:
newarr = self.as_array + other
if (newarr < 0).any():
raise ValueError("iadd causes length change")
val = newarr
return BlockPlacement(val)
def add(self, other) -> BlockPlacement:
# We can get here with int or ndarray
return self.iadd(other)
cdef slice _ensure_has_slice(self):
if not self._has_slice:
self._as_slice = indexer_as_slice(self._as_array)
self._has_slice = True
return self._as_slice
cpdef BlockPlacement increment_above(self, Py_ssize_t loc):
"""
Increment any entries of 'loc' or above by one.
"""
cdef:
slice nv, s = self._ensure_has_slice()
Py_ssize_t other_int, start, stop, step
ndarray[intp_t, ndim=1] newarr
if s is not None:
# see if we are either all-above or all-below, each of which
# has a fastpath available.
start, stop, step, _ = slice_get_indices_ex(s)
if start < loc and stop <= loc:
# We are entirely below, nothing to increment
return self
if start >= loc and stop >= loc:
# We are entirely above, so we can efficiently increment our slice
nv = slice(start + 1, stop + 1, step)
return BlockPlacement(nv)
if loc == 0:
# fastpath where we know everything is >= 0
newarr = self.as_array + 1
return BlockPlacement(newarr)
newarr = self.as_array.copy()
newarr[newarr >= loc] += 1
return BlockPlacement(newarr)
def tile_for_unstack(self, factor: int) -> np.ndarray:
"""
Find the new mgr_locs for the un-stacked version of a Block.
"""
cdef:
slice slc = self._ensure_has_slice()
slice new_slice
ndarray[intp_t, ndim=1] new_placement
if slc is not None and slc.step == 1:
new_slc = slice(slc.start * factor, slc.stop * factor, 1)
# equiv: np.arange(new_slc.start, new_slc.stop, dtype=np.intp)
new_placement = cnp.PyArray_Arange(new_slc.start, new_slc.stop, 1, NPY_INTP)
else:
# Note: test_pivot_table_empty_aggfunc gets here with `slc is not None`
mapped = [
# equiv: np.arange(x * factor, (x + 1) * factor, dtype=np.intp)
cnp.PyArray_Arange(x * factor, (x + 1) * factor, 1, NPY_INTP)
for x in self
]
new_placement = np.concatenate(mapped)
return new_placement
cdef slice slice_canonize(slice s):
"""
Convert slice to canonical bounded form.
"""
cdef:
Py_ssize_t start = 0, stop = 0, step = 1
if s.step is None:
step = 1
else:
step = <Py_ssize_t>s.step
if step == 0:
raise ValueError("slice step cannot be zero")
if step > 0:
if s.stop is None:
raise ValueError("unbounded slice")
stop = <Py_ssize_t>s.stop
if s.start is None:
start = 0
else:
start = <Py_ssize_t>s.start
if start > stop:
start = stop
elif step < 0:
if s.start is None:
raise ValueError("unbounded slice")
start = <Py_ssize_t>s.start
if s.stop is None:
stop = -1
else:
stop = <Py_ssize_t>s.stop
if stop > start:
stop = start
if start < 0 or (stop < 0 and s.stop is not None and step > 0):
raise ValueError("unbounded slice")
if stop < 0:
return slice(start, None, step)
else:
return slice(start, stop, step)
cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1:
"""
Get length of a bounded slice.
The slice must not have any "open" bounds that would create dependency on
container size, i.e.:
- if ``s.step is None or s.step > 0``, ``s.stop`` is not ``None``
- if ``s.step < 0``, ``s.start`` is not ``None``
Otherwise, the result is unreliable.
"""
cdef:
Py_ssize_t start, stop, step, length
if slc is None:
raise TypeError("slc must be slice") # pragma: no cover
PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length)
return length
cdef (Py_ssize_t, Py_ssize_t, Py_ssize_t, Py_ssize_t) slice_get_indices_ex(
slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX
):
"""
Get (start, stop, step, length) tuple for a slice.
If `objlen` is not specified, slice must be bounded, otherwise the result
will be wrong.
"""
cdef:
Py_ssize_t start, stop, step, length
if slc is None:
raise TypeError("slc should be a slice") # pragma: no cover
PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length)
return start, stop, step, length
cdef slice_getitem(slice slc, ind):
cdef:
Py_ssize_t s_start, s_stop, s_step, s_len
Py_ssize_t ind_start, ind_stop, ind_step, ind_len
s_start, s_stop, s_step, s_len = slice_get_indices_ex(slc)
if isinstance(ind, slice):
ind_start, ind_stop, ind_step, ind_len = slice_get_indices_ex(ind, s_len)
if ind_step > 0 and ind_len == s_len:
# short-cut for no-op slice
return slc
if ind_step < 0:
s_start = s_stop - s_step
ind_step = -ind_step
s_step *= ind_step
s_stop = s_start + ind_stop * s_step
s_start = s_start + ind_start * s_step
if s_step < 0 and s_stop < 0:
return slice(s_start, None, s_step)
else:
return slice(s_start, s_stop, s_step)
else:
# NOTE:
# this is the C-optimized equivalent of
# `np.arange(s_start, s_stop, s_step, dtype=np.intp)[ind]`
return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INTP)[ind]
@cython.boundscheck(False)
@cython.wraparound(False)
cdef slice indexer_as_slice(intp_t[:] vals):
cdef:
Py_ssize_t i, n, start, stop
int64_t d
if vals is None:
raise TypeError("vals must be ndarray") # pragma: no cover
n = vals.shape[0]
if n == 0 or vals[0] < 0:
return None
if n == 1:
return slice(vals[0], vals[0] + 1, 1)
if vals[1] < 0:
return None
# n > 2
d = vals[1] - vals[0]
if d == 0:
return None
for i in range(2, n):
if vals[i] < 0 or vals[i] - vals[i - 1] != d:
return None
start = vals[0]
stop = start + n * d
if stop < 0 and d < 0:
return slice(start, None, d)
else:
return slice(start, stop, d)
@cython.boundscheck(False)
@cython.wraparound(False)
def get_blkno_indexers(
int64_t[:] blknos, bint group=True
) -> list[tuple[int, slice | np.ndarray]]:
"""
Enumerate contiguous runs of integers in ndarray.
Iterate over elements of `blknos` yielding ``(blkno, slice(start, stop))``
pairs for each contiguous run found.
If `group` is True and there is more than one run for a certain blkno,
``(blkno, array)`` with an array containing positions of all elements equal
to blkno.
Returns
-------
list[tuple[int, slice | np.ndarray]]
"""
# There's blkno in this function's name because it's used in block &
# blockno handling.
cdef:
int64_t cur_blkno
Py_ssize_t i, start, stop, n, diff
cnp.npy_intp tot_len
int64_t blkno
object group_dict = defaultdict(list)
ndarray[int64_t, ndim=1] arr
n = blknos.shape[0]
result = list()
start = 0
cur_blkno = blknos[start]
if n == 0:
pass
elif group is False:
for i in range(1, n):
if blknos[i] != cur_blkno:
result.append((cur_blkno, slice(start, i)))
start = i
cur_blkno = blknos[i]
result.append((cur_blkno, slice(start, n)))
else:
for i in range(1, n):
if blknos[i] != cur_blkno:
group_dict[cur_blkno].append((start, i))
start = i
cur_blkno = blknos[i]
group_dict[cur_blkno].append((start, n))
for blkno, slices in group_dict.items():
if len(slices) == 1:
result.append((blkno, slice(slices[0][0], slices[0][1])))
else:
tot_len = sum(stop - start for start, stop in slices)
# equiv np.empty(tot_len, dtype=np.int64)
arr = cnp.PyArray_EMPTY(1, &tot_len, cnp.NPY_INT64, 0)
i = 0
for start, stop in slices:
for diff in range(start, stop):
arr[i] = diff
i += 1
result.append((blkno, arr))
return result
def get_blkno_placements(blknos, group: bool = True):
"""
Parameters
----------
blknos : np.ndarray[int64]
group : bool, default True
Returns
-------
iterator
yields (blkno, BlockPlacement) pairs
"""
blknos = ensure_int64(blknos)
for blkno, indexer in get_blkno_indexers(blknos, group):
yield blkno, BlockPlacement(indexer)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef update_blklocs_and_blknos(
ndarray[intp_t, ndim=1] blklocs,
ndarray[intp_t, ndim=1] blknos,
Py_ssize_t loc,
intp_t nblocks,
):
"""
Update blklocs and blknos when a new column is inserted at 'loc'.
"""
cdef:
Py_ssize_t i
cnp.npy_intp length = len(blklocs) + 1
ndarray[intp_t, ndim=1] new_blklocs, new_blknos
# equiv: new_blklocs = np.empty(length, dtype=np.intp)
new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
for i in range(loc):
new_blklocs[i] = blklocs[i]
new_blknos[i] = blknos[i]
new_blklocs[loc] = 0
new_blknos[loc] = nblocks
for i in range(loc, length - 1):
new_blklocs[i + 1] = blklocs[i]
new_blknos[i + 1] = blknos[i]
return new_blklocs, new_blknos
def _unpickle_block(values, placement, ndim):
# We have to do some gymnastics b/c "ndim" is keyword-only
from pandas.core.internals.blocks import new_block
return new_block(values, placement, ndim=ndim)
@cython.freelist(64)
cdef class SharedBlock:
"""
Defining __init__ in a cython class significantly improves performance.
"""
cdef:
public BlockPlacement _mgr_locs
readonly int ndim
def __cinit__(self, values, placement: BlockPlacement, ndim: int):
"""
Parameters
----------
values : np.ndarray or ExtensionArray
We assume maybe_coerce_values has already been called.
placement : BlockPlacement
ndim : int
1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
"""
self._mgr_locs = placement
self.ndim = ndim
cpdef __reduce__(self):
args = (self.values, self.mgr_locs.indexer, self.ndim)
return _unpickle_block, args
cpdef __setstate__(self, state):
from pandas.core.construction import extract_array
self.mgr_locs = BlockPlacement(state[0])
self.values = extract_array(state[1], extract_numpy=True)
if len(state) > 2:
# we stored ndim
self.ndim = state[2]
else:
# older pickle
from pandas.core.internals.api import maybe_infer_ndim
ndim = maybe_infer_ndim(self.values, self.mgr_locs)
self.ndim = ndim
cdef class NumpyBlock(SharedBlock):
cdef:
public ndarray values
def __cinit__(self, ndarray values, BlockPlacement placement, int ndim):
# set values here; the (implicit) call to SharedBlock.__cinit__ will
# set placement and ndim
self.values = values
cpdef NumpyBlock getitem_block_index(self, slice slicer):
"""
Perform __getitem__-like specialized to slicing along index.
Assumes self.ndim == 2
"""
new_values = self.values[..., slicer]
return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
cdef class NDArrayBackedBlock(SharedBlock):
"""
Block backed by NDArrayBackedExtensionArray
"""
cdef public:
NDArrayBacked values
def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim):
# set values here; the (implicit) call to SharedBlock.__cinit__ will
# set placement and ndim
self.values = values
cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer):
"""
Perform __getitem__-like specialized to slicing along index.
Assumes self.ndim == 2
"""
new_values = self.values[..., slicer]
return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
cdef class Block(SharedBlock):
cdef:
public object values
def __cinit__(self, object values, BlockPlacement placement, int ndim):
# set values here; the (implicit) call to SharedBlock.__cinit__ will
# set placement and ndim
self.values = values
@cython.freelist(64)
cdef class BlockManager:
cdef:
public tuple blocks
public list axes
public bint _known_consolidated, _is_consolidated
public ndarray _blknos, _blklocs
def __cinit__(self, blocks=None, axes=None, verify_integrity=True):
# None as defaults for unpickling GH#42345
if blocks is None:
# This adds 1-2 microseconds to DataFrame(np.array([]))
return
if isinstance(blocks, list):
# Backward compat for e.g. pyarrow
blocks = tuple(blocks)
self.blocks = blocks
self.axes = axes.copy() # copy to make sure we are not remotely-mutable
# Populate known_consolidate, blknos, and blklocs lazily
self._known_consolidated = False
self._is_consolidated = False
self._blknos = None
self._blklocs = None
# -------------------------------------------------------------------
# Block Placement
def _rebuild_blknos_and_blklocs(self) -> None:
"""
Update mgr._blknos / mgr._blklocs.
"""
cdef:
intp_t blkno, i, j
cnp.npy_intp length = self.shape[0]
SharedBlock blk
BlockPlacement bp
ndarray[intp_t, ndim=1] new_blknos, new_blklocs
# equiv: np.empty(length, dtype=np.intp)
new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
# equiv: new_blknos.fill(-1)
cnp.PyArray_FILLWBYTE(new_blknos, -1)
cnp.PyArray_FILLWBYTE(new_blklocs, -1)
for blkno, blk in enumerate(self.blocks):
bp = blk._mgr_locs
# Iterating over `bp` is a faster equivalent to
# new_blknos[bp.indexer] = blkno
# new_blklocs[bp.indexer] = np.arange(len(bp))
for i, j in enumerate(bp):
new_blknos[j] = blkno
new_blklocs[j] = i
for i in range(length):
# faster than `for blkno in new_blknos`
# https://github.com/cython/cython/issues/4393
blkno = new_blknos[i]
# If there are any -1s remaining, this indicates that our mgr_locs
# are invalid.
if blkno == -1:
raise AssertionError("Gaps in blk ref_locs")
self._blknos = new_blknos
self._blklocs = new_blklocs
# -------------------------------------------------------------------
# Pickle
cpdef __reduce__(self):
if len(self.axes) == 1:
# SingleBlockManager, __init__ expects Block, axis
args = (self.blocks[0], self.axes[0])
else:
args = (self.blocks, self.axes)
return type(self), args
cpdef __setstate__(self, state):
from pandas.core.construction import extract_array
from pandas.core.internals.blocks import (
ensure_block_shape,
new_block,
)
from pandas.core.internals.managers import ensure_index
if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
state = state[3]["0.14.1"]
axes = [ensure_index(ax) for ax in state["axes"]]
ndim = len(axes)
for blk in state["blocks"]:
vals = blk["values"]
# older versions may hold e.g. DatetimeIndex instead of DTA
vals = extract_array(vals, extract_numpy=True)
blk["values"] = ensure_block_shape(vals, ndim=ndim)
nbs = [
new_block(blk["values"], blk["mgr_locs"], ndim=ndim)
for blk in state["blocks"]
]
blocks = tuple(nbs)
self.blocks = blocks
self.axes = axes
else: # pragma: no cover
raise NotImplementedError("pre-0.14.1 pickles are no longer supported")
self._post_setstate()
def _post_setstate(self) -> None:
self._is_consolidated = False
self._known_consolidated = False
self._rebuild_blknos_and_blklocs()
# -------------------------------------------------------------------
# Indexing
cdef BlockManager _get_index_slice(self, slobj):
cdef:
SharedBlock blk, nb
BlockManager mgr
ndarray blknos, blklocs
nbs = []
for blk in self.blocks:
nb = blk.getitem_block_index(slobj)
nbs.append(nb)
new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False)
# We can avoid having to rebuild blklocs/blknos
blklocs = self._blklocs
blknos = self._blknos
if blknos is not None:
mgr._blknos = blknos.copy()
mgr._blklocs = blklocs.copy()
return mgr
def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
if axis == 0:
new_blocks = self._slice_take_blocks_ax0(slobj)
elif axis == 1:
return self._get_index_slice(slobj)
else:
raise IndexError("Requested axis not found in manager")
new_axes = list(self.axes)
new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)

View File

@@ -0,0 +1,557 @@
import numbers
from operator import (
le,
lt,
)
from cpython.datetime cimport (
PyDateTime_IMPORT,
PyDelta_Check,
)
PyDateTime_IMPORT
from cpython.object cimport (
Py_EQ,
Py_GE,
Py_GT,
Py_LE,
Py_LT,
Py_NE,
PyObject_RichCompare,
)
import cython
from cython import Py_ssize_t
import numpy as np
cimport numpy as cnp
from numpy cimport (
NPY_QUICKSORT,
PyArray_ArgSort,
PyArray_Take,
float32_t,
float64_t,
int32_t,
int64_t,
ndarray,
uint64_t,
)
cnp.import_array()
from pandas._libs cimport util
from pandas._libs.hashtable cimport Int64Vector
from pandas._libs.tslibs.timedeltas cimport _Timedelta
from pandas._libs.tslibs.timestamps cimport _Timestamp
from pandas._libs.tslibs.timezones cimport tz_compare
from pandas._libs.tslibs.util cimport (
is_float_object,
is_integer_object,
is_timedelta64_object,
)
VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither'])
cdef class IntervalMixin:
@property
def closed_left(self):
"""
Check if the interval is closed on the left side.
For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
Returns
-------
bool
True if the Interval is closed on the left-side.
"""
return self.closed in ('left', 'both')
@property
def closed_right(self):
"""
Check if the interval is closed on the right side.
For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
Returns
-------
bool
True if the Interval is closed on the right-side.
"""
return self.closed in ('right', 'both')
@property
def open_left(self):
"""
Check if the interval is open on the left side.
For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
Returns
-------
bool
True if the Interval is not closed on the left-side.
"""
return not self.closed_left
@property
def open_right(self):
"""
Check if the interval is open on the right side.
For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
Returns
-------
bool
True if the Interval is not closed on the right-side.
"""
return not self.closed_right
@property
def mid(self):
"""
Return the midpoint of the Interval.
"""
try:
return 0.5 * (self.left + self.right)
except TypeError:
# datetime safe version
return self.left + 0.5 * self.length
@property
def length(self):
"""
Return the length of the Interval.
"""
return self.right - self.left
@property
def is_empty(self):
"""
Indicates if an interval is empty, meaning it contains no points.
.. versionadded:: 0.25.0
Returns
-------
bool or ndarray
A boolean indicating if a scalar :class:`Interval` is empty, or a
boolean ``ndarray`` positionally indicating if an ``Interval`` in
an :class:`~arrays.IntervalArray` or :class:`IntervalIndex` is
empty.
Examples
--------
An :class:`Interval` that contains points is not empty:
>>> pd.Interval(0, 1, closed='right').is_empty
False
An ``Interval`` that does not contain any points is empty:
>>> pd.Interval(0, 0, closed='right').is_empty
True
>>> pd.Interval(0, 0, closed='left').is_empty
True
>>> pd.Interval(0, 0, closed='neither').is_empty
True
An ``Interval`` that contains a single point is not empty:
>>> pd.Interval(0, 0, closed='both').is_empty
False
An :class:`~arrays.IntervalArray` or :class:`IntervalIndex` returns a
boolean ``ndarray`` positionally indicating if an ``Interval`` is
empty:
>>> ivs = [pd.Interval(0, 0, closed='neither'),
... pd.Interval(1, 2, closed='neither')]
>>> pd.arrays.IntervalArray(ivs).is_empty
array([ True, False])
Missing values are not considered empty:
>>> ivs = [pd.Interval(0, 0, closed='neither'), np.nan]
>>> pd.IntervalIndex(ivs).is_empty
array([ True, False])
"""
return (self.right == self.left) & (self.closed != 'both')
def _check_closed_matches(self, other, name='other'):
"""
Check if the closed attribute of `other` matches.
Note that 'left' and 'right' are considered different from 'both'.
Parameters
----------
other : Interval, IntervalIndex, IntervalArray
name : str
Name to use for 'other' in the error message.
Raises
------
ValueError
When `other` is not closed exactly the same as self.
"""
if self.closed != other.closed:
raise ValueError(f"'{name}.closed' is {repr(other.closed)}, "
f"expected {repr(self.closed)}.")
cdef bint _interval_like(other):
return (hasattr(other, 'left')
and hasattr(other, 'right')
and hasattr(other, 'closed'))
cdef class Interval(IntervalMixin):
"""
Immutable object implementing an Interval, a bounded slice-like interval.
Parameters
----------
left : orderable scalar
Left bound for the interval.
right : orderable scalar
Right bound for the interval.
closed : {'right', 'left', 'both', 'neither'}, default 'right'
Whether the interval is closed on the left-side, right-side, both or
neither. See the Notes for more detailed explanation.
See Also
--------
IntervalIndex : An Index of Interval objects that are all closed on the
same side.
cut : Convert continuous data into discrete bins (Categorical
of Interval objects).
qcut : Convert continuous data into bins (Categorical of Interval objects)
based on quantiles.
Period : Represents a period of time.
Notes
-----
The parameters `left` and `right` must be of the same type; you must be
able to compare them and they must satisfy ``left <= right``.
A closed interval (in mathematics denoted by square brackets) contains
its endpoints, i.e. the closed interval ``[0, 5]`` is characterized by the
conditions ``0 <= x <= 5``. This is what ``closed='both'`` stands for.
An open interval (in mathematics denoted by parentheses) does not contain
its endpoints, i.e. the open interval ``(0, 5)`` is characterized by the
conditions ``0 < x < 5``. This is what ``closed='neither'`` stands for.
Intervals can also be half-open or half-closed, i.e. ``[0, 5)`` is
described by ``0 <= x < 5`` (``closed='left'``) and ``(0, 5]`` is
described by ``0 < x <= 5`` (``closed='right'``).
Examples
--------
It is possible to build Intervals of different types, like numeric ones:
>>> iv = pd.Interval(left=0, right=5)
>>> iv
Interval(0, 5, closed='right')
You can check if an element belongs to it
>>> 2.5 in iv
True
You can test the bounds (``closed='right'``, so ``0 < x <= 5``):
>>> 0 in iv
False
>>> 5 in iv
True
>>> 0.0001 in iv
True
Calculate its length
>>> iv.length
5
You can operate with `+` and `*` over an Interval and the operation
is applied to each of its bounds, so the result depends on the type
of the bound elements
>>> shifted_iv = iv + 3
>>> shifted_iv
Interval(3, 8, closed='right')
>>> extended_iv = iv * 10.0
>>> extended_iv
Interval(0.0, 50.0, closed='right')
To create a time interval you can use Timestamps as the bounds
>>> year_2017 = pd.Interval(pd.Timestamp('2017-01-01 00:00:00'),
... pd.Timestamp('2018-01-01 00:00:00'),
... closed='left')
>>> pd.Timestamp('2017-01-01 00:00') in year_2017
True
>>> year_2017.length
Timedelta('365 days 00:00:00')
"""
_typ = "interval"
__array_priority__ = 1000
cdef readonly object left
"""
Left bound for the interval.
"""
cdef readonly object right
"""
Right bound for the interval.
"""
cdef readonly str closed
"""
Whether the interval is closed on the left-side, right-side, both or
neither.
"""
def __init__(self, left, right, str closed='right'):
# note: it is faster to just do these checks than to use a special
# constructor (__cinit__/__new__) to avoid them
self._validate_endpoint(left)
self._validate_endpoint(right)
if closed not in VALID_CLOSED:
raise ValueError(f"invalid option for 'closed': {closed}")
if not left <= right:
raise ValueError("left side of interval must be <= right side")
if (isinstance(left, _Timestamp) and
not tz_compare(left.tzinfo, right.tzinfo)):
# GH 18538
raise ValueError("left and right must have the same time zone, got "
f"{repr(left.tzinfo)}' and {repr(right.tzinfo)}")
self.left = left
self.right = right
self.closed = closed
def _validate_endpoint(self, endpoint):
# GH 23013
if not (is_integer_object(endpoint) or is_float_object(endpoint) or
isinstance(endpoint, (_Timestamp, _Timedelta))):
raise ValueError("Only numeric, Timestamp and Timedelta endpoints "
"are allowed when constructing an Interval.")
def __hash__(self):
return hash((self.left, self.right, self.closed))
def __contains__(self, key) -> bool:
if _interval_like(key):
raise TypeError("__contains__ not defined for two intervals")
return ((self.left < key if self.open_left else self.left <= key) and
(key < self.right if self.open_right else key <= self.right))
def __richcmp__(self, other, op: int):
if isinstance(other, Interval):
self_tuple = (self.left, self.right, self.closed)
other_tuple = (other.left, other.right, other.closed)
return PyObject_RichCompare(self_tuple, other_tuple, op)
elif util.is_array(other):
return np.array(
[PyObject_RichCompare(self, x, op) for x in other],
dtype=bool,
)
return NotImplemented
def __reduce__(self):
args = (self.left, self.right, self.closed)
return (type(self), args)
def _repr_base(self):
left = self.left
right = self.right
# TODO: need more general formatting methodology here
if isinstance(left, _Timestamp) and isinstance(right, _Timestamp):
left = left._short_repr
right = right._short_repr
return left, right
def __repr__(self) -> str:
left, right = self._repr_base()
name = type(self).__name__
repr_str = f'{name}({repr(left)}, {repr(right)}, closed={repr(self.closed)})'
return repr_str
def __str__(self) -> str:
left, right = self._repr_base()
start_symbol = '[' if self.closed_left else '('
end_symbol = ']' if self.closed_right else ')'
return f'{start_symbol}{left}, {right}{end_symbol}'
def __add__(self, y):
if (
isinstance(y, numbers.Number)
or PyDelta_Check(y)
or is_timedelta64_object(y)
):
return Interval(self.left + y, self.right + y, closed=self.closed)
elif (
isinstance(y, Interval)
and (
isinstance(self, numbers.Number)
or PyDelta_Check(self)
or is_timedelta64_object(self)
)
):
return Interval(y.left + self, y.right + self, closed=y.closed)
return NotImplemented
def __sub__(self, y):
if (
isinstance(y, numbers.Number)
or PyDelta_Check(y)
or is_timedelta64_object(y)
):
return Interval(self.left - y, self.right - y, closed=self.closed)
return NotImplemented
def __mul__(self, y):
if isinstance(y, numbers.Number):
return Interval(self.left * y, self.right * y, closed=self.closed)
elif isinstance(y, Interval) and isinstance(self, numbers.Number):
return Interval(y.left * self, y.right * self, closed=y.closed)
return NotImplemented
def __truediv__(self, y):
if isinstance(y, numbers.Number):
return Interval(self.left / y, self.right / y, closed=self.closed)
return NotImplemented
def __floordiv__(self, y):
if isinstance(y, numbers.Number):
return Interval(
self.left // y, self.right // y, closed=self.closed)
return NotImplemented
def overlaps(self, other):
"""
Check whether two Interval objects overlap.
Two intervals overlap if they share a common point, including closed
endpoints. Intervals that only have an open endpoint in common do not
overlap.
Parameters
----------
other : Interval
Interval to check against for an overlap.
Returns
-------
bool
True if the two intervals overlap.
See Also
--------
IntervalArray.overlaps : The corresponding method for IntervalArray.
IntervalIndex.overlaps : The corresponding method for IntervalIndex.
Examples
--------
>>> i1 = pd.Interval(0, 2)
>>> i2 = pd.Interval(1, 3)
>>> i1.overlaps(i2)
True
>>> i3 = pd.Interval(4, 5)
>>> i1.overlaps(i3)
False
Intervals that share closed endpoints overlap:
>>> i4 = pd.Interval(0, 1, closed='both')
>>> i5 = pd.Interval(1, 2, closed='both')
>>> i4.overlaps(i5)
True
Intervals that only have an open endpoint in common do not overlap:
>>> i6 = pd.Interval(1, 2, closed='neither')
>>> i4.overlaps(i6)
False
"""
if not isinstance(other, Interval):
raise TypeError("`other` must be an Interval, "
f"got {type(other).__name__}")
# equality is okay if both endpoints are closed (overlap at a point)
op1 = le if (self.closed_left and other.closed_right) else lt
op2 = le if (other.closed_left and self.closed_right) else lt
# overlaps is the negation of the two intervals being disjoint:
# disjoint = (A.left > B.right) or (B.left > A.right)
# (simplifying the negation allows this to be done in fewer operations)
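# Hedged worked example: for self = Interval(0, 2) and other = Interval(2, 4)
# (both closed='right'), other.closed_left is False so op2 is `lt`, and
# op2(other.left, self.right) is 2 < 2 == False, so the intervals do not overlap.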
return op1(self.left, other.right) and op2(other.left, self.right)
@cython.wraparound(False)
@cython.boundscheck(False)
def intervals_to_interval_bounds(ndarray intervals, bint validate_closed=True):
"""
Parameters
----------
intervals : ndarray
Object array of Intervals / nulls.
validate_closed: bool, default True
Boolean indicating if all intervals must be closed on the same side.
Mismatching closed will raise if True, else return None for closed.
Returns
-------
tuple of
left : ndarray
right : ndarray
closed: str
"""
cdef:
object closed = None, interval
Py_ssize_t i, n = len(intervals)
ndarray left, right
bint seen_closed = False
left = np.empty(n, dtype=intervals.dtype)
right = np.empty(n, dtype=intervals.dtype)
for i in range(n):
interval = intervals[i]
if interval is None or util.is_nan(interval):
left[i] = np.nan
right[i] = np.nan
continue
if not isinstance(interval, Interval):
raise TypeError(f"type {type(interval)} with value "
f"{interval} is not an interval")
left[i] = interval.left
right[i] = interval.right
if not seen_closed:
seen_closed = True
closed = interval.closed
elif closed != interval.closed:
closed = None
if validate_closed:
raise ValueError("intervals must all be closed on the same side")
return left, right, closed
include "intervaltree.pxi"

View File

@@ -0,0 +1,427 @@
"""
Template for intervaltree
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
from pandas._libs.algos import is_monotonic
ctypedef fused int_scalar_t:
int64_t
float64_t
ctypedef fused uint_scalar_t:
uint64_t
float64_t
ctypedef fused scalar_t:
int_scalar_t
uint_scalar_t
# ----------------------------------------------------------------------
# IntervalTree
# ----------------------------------------------------------------------
cdef class IntervalTree(IntervalMixin):
"""A centered interval tree
Based off the algorithm described on Wikipedia:
https://en.wikipedia.org/wiki/Interval_tree
we are emulating the IndexEngine interface
"""
cdef readonly:
ndarray left, right
IntervalNode root
object dtype
str closed
object _is_overlapping, _left_sorter, _right_sorter
Py_ssize_t _na_count
def __init__(self, left, right, closed='right', leaf_size=100):
"""
Parameters
----------
left, right : np.ndarray[ndim=1]
Left and right bounds for each interval. Assumed to contain no
NaNs.
closed : {'left', 'right', 'both', 'neither'}, optional
Whether the intervals are closed on the left-side, right-side, both
or neither. Defaults to 'right'.
leaf_size : int, optional
Parameter that controls when the tree switches from creating nodes
to brute-force search. Tune this parameter to optimize query
performance.
"""
if closed not in ['left', 'right', 'both', 'neither']:
raise ValueError("invalid option for 'closed': %s" % closed)
left = np.asarray(left)
right = np.asarray(right)
self.dtype = np.result_type(left, right)
self.left = np.asarray(left, dtype=self.dtype)
self.right = np.asarray(right, dtype=self.dtype)
indices = np.arange(len(left), dtype='int64')
self.closed = closed
# GH 23352: ensure no nan in nodes
mask = ~np.isnan(self.left)
self._na_count = len(mask) - mask.sum()
self.left = self.left[mask]
self.right = self.right[mask]
indices = indices[mask]
node_cls = NODE_CLASSES[str(self.dtype), closed]
self.root = node_cls(self.left, self.right, indices, leaf_size)
@property
def left_sorter(self) -> np.ndarray:
"""How to sort the left labels; this is used for binary search
"""
if self._left_sorter is None:
self._left_sorter = np.argsort(self.left)
return self._left_sorter
@property
def right_sorter(self) -> np.ndarray:
"""How to sort the right labels
"""
if self._right_sorter is None:
self._right_sorter = np.argsort(self.right)
return self._right_sorter
@property
def is_overlapping(self) -> bool:
"""
Determine if the IntervalTree contains overlapping intervals.
Cached as self._is_overlapping.
"""
if self._is_overlapping is not None:
return self._is_overlapping
# <= when both sides closed since endpoints can overlap
op = le if self.closed == 'both' else lt
# overlap if start of current interval < end of previous interval
# (current and previous in terms of sorted order by left/start side)
current = self.left[self.left_sorter[1:]]
previous = self.right[self.left_sorter[:-1]]
self._is_overlapping = bool(op(current, previous).any())
return self._is_overlapping
@property
def is_monotonic_increasing(self) -> bool:
"""
Return True if the IntervalTree is monotonic increasing (only equal or
increasing values), else False
"""
if self._na_count > 0:
return False
values = [self.right, self.left]
sort_order = np.lexsort(values)
return is_monotonic(sort_order, False)[0]
def get_indexer(self, scalar_t[:] target) -> np.ndarray:
"""Return the positions corresponding to unique intervals that overlap
with the given array of scalar targets.
"""
# TODO: write get_indexer_intervals
cdef:
Py_ssize_t old_len
Py_ssize_t i
Int64Vector result
result = Int64Vector()
old_len = 0
for i in range(len(target)):
try:
self.root.query(result, target[i])
except OverflowError:
# overflow -> no match, which is already handled below
pass
if result.data.n == old_len:
result.append(-1)
elif result.data.n > old_len + 1:
raise KeyError(
'indexer does not intersect a unique set of intervals')
old_len = result.data.n
return result.to_array().astype('intp')
def get_indexer_non_unique(self, scalar_t[:] target):
"""Return the positions corresponding to intervals that overlap with
the given array of scalar targets. Non-unique positions are repeated.
"""
cdef:
Py_ssize_t old_len
Py_ssize_t i
Int64Vector result, missing
result = Int64Vector()
missing = Int64Vector()
old_len = 0
for i in range(len(target)):
try:
self.root.query(result, target[i])
except OverflowError:
# overflow -> no match, which is already handled below
pass
if result.data.n == old_len:
result.append(-1)
missing.append(i)
old_len = result.data.n
return (result.to_array().astype('intp'),
missing.to_array().astype('intp'))
def __repr__(self) -> str:
return ('<IntervalTree[{dtype},{closed}]: '
'{n_elements} elements>'.format(
dtype=self.dtype, closed=self.closed,
n_elements=self.root.n_elements))
# compat with IndexEngine interface
def clear_mapping(self) -> None:
pass
cdef take(ndarray source, ndarray indices):
"""Take the given positions from a 1D ndarray
"""
return PyArray_Take(source, indices, 0)
cdef sort_values_and_indices(all_values, all_indices, subset):
indices = take(all_indices, subset)
values = take(all_values, subset)
sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT)
sorted_values = take(values, sorter)
sorted_indices = take(indices, sorter)
return sorted_values, sorted_indices
# ----------------------------------------------------------------------
# Nodes
# ----------------------------------------------------------------------
@cython.internal
cdef class IntervalNode:
cdef readonly:
int64_t n_elements, n_center, leaf_size
bint is_leaf_node
def __repr__(self) -> str:
if self.is_leaf_node:
return (
f"<{type(self).__name__}: {self.n_elements} elements (terminal)>"
)
else:
n_left = self.left_node.n_elements
n_right = self.right_node.n_elements
n_center = self.n_elements - n_left - n_right
return (
f"<{type(self).__name__}: "
f"pivot {self.pivot}, {self.n_elements} elements "
f"({n_left} left, {n_right} right, {n_center} overlapping)>"
)
def counts(self):
"""
Inspect counts on this node; useful for debugging purposes.
"""
if self.is_leaf_node:
return self.n_elements
else:
m = len(self.center_left_values)
l = self.left_node.counts()
r = self.right_node.counts()
return (m, (l, r))
# we need specialized nodes and leaves to optimize for different dtype and
# closed values
{{py:
nodes = []
for dtype in ['float64', 'int64', 'uint64']:
for closed, cmp_left, cmp_right in [
('left', '<=', '<'),
('right', '<', '<='),
('both', '<=', '<='),
('neither', '<', '<')]:
cmp_left_converse = '<' if cmp_left == '<=' else '<='
cmp_right_converse = '<' if cmp_right == '<=' else '<='
if dtype.startswith('int'):
fused_prefix = 'int_'
elif dtype.startswith('uint'):
fused_prefix = 'uint_'
elif dtype.startswith('float'):
fused_prefix = ''
nodes.append((dtype, dtype.title(),
closed, closed.title(),
cmp_left,
cmp_right,
cmp_left_converse,
cmp_right_converse,
fused_prefix))
}}
NODE_CLASSES = {}
{{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right,
cmp_left_converse, cmp_right_converse, fused_prefix in nodes}}
@cython.internal
cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode(IntervalNode):
"""Non-terminal node for an IntervalTree
Categorizes intervals by those that fall to the left, those that fall to
the right, and those that overlap with the pivot.
"""
cdef readonly:
{{dtype_title}}Closed{{closed_title}}IntervalNode left_node, right_node
{{dtype}}_t[:] center_left_values, center_right_values, left, right
int64_t[:] center_left_indices, center_right_indices, indices
{{dtype}}_t min_left, max_right
{{dtype}}_t pivot
def __init__(self,
ndarray[{{dtype}}_t, ndim=1] left,
ndarray[{{dtype}}_t, ndim=1] right,
ndarray[int64_t, ndim=1] indices,
int64_t leaf_size):
self.n_elements = len(left)
self.leaf_size = leaf_size
# min_left and max_right are used to speed-up query by skipping
# query on sub-nodes. If this node has size 0, query is cheap,
# so these values don't matter.
if left.size > 0:
self.min_left = left.min()
self.max_right = right.max()
else:
self.min_left = 0
self.max_right = 0
if self.n_elements <= leaf_size:
# make this a terminal (leaf) node
self.is_leaf_node = True
self.left = left
self.right = right
self.indices = indices
self.n_center = 0
else:
# calculate a pivot so we can create child nodes
self.is_leaf_node = False
self.pivot = np.median(left / 2 + right / 2)
left_set, right_set, center_set = self.classify_intervals(
left, right)
self.left_node = self.new_child_node(left, right,
indices, left_set)
self.right_node = self.new_child_node(left, right,
indices, right_set)
self.center_left_values, self.center_left_indices = \
sort_values_and_indices(left, indices, center_set)
self.center_right_values, self.center_right_indices = \
sort_values_and_indices(right, indices, center_set)
self.n_center = len(self.center_left_indices)
@cython.wraparound(False)
@cython.boundscheck(False)
cdef classify_intervals(self, {{dtype}}_t[:] left, {{dtype}}_t[:] right):
"""Classify the given intervals based upon whether they fall to the
left, right, or overlap with this node's pivot.
"""
cdef:
Int64Vector left_ind, right_ind, overlapping_ind
Py_ssize_t i
left_ind = Int64Vector()
right_ind = Int64Vector()
overlapping_ind = Int64Vector()
for i in range(self.n_elements):
if right[i] {{cmp_right_converse}} self.pivot:
left_ind.append(i)
elif self.pivot {{cmp_left_converse}} left[i]:
right_ind.append(i)
else:
overlapping_ind.append(i)
return (left_ind.to_array(),
right_ind.to_array(),
overlapping_ind.to_array())
cdef new_child_node(self,
ndarray[{{dtype}}_t, ndim=1] left,
ndarray[{{dtype}}_t, ndim=1] right,
ndarray[int64_t, ndim=1] indices,
ndarray[int64_t, ndim=1] subset):
"""Create a new child node.
"""
left = take(left, subset)
right = take(right, subset)
indices = take(indices, subset)
return {{dtype_title}}Closed{{closed_title}}IntervalNode(
left, right, indices, self.leaf_size)
@cython.wraparound(False)
@cython.boundscheck(False)
@cython.initializedcheck(False)
cpdef query(self, Int64Vector result, {{fused_prefix}}scalar_t point):
"""Recursively query this node and its sub-nodes for intervals that
overlap with the query point.
"""
cdef:
int64_t[:] indices
{{dtype}}_t[:] values
Py_ssize_t i
if self.is_leaf_node:
# Once we get down to a certain size, it doesn't make sense to
# continue the binary tree structure. Instead, we use linear
# search.
for i in range(self.n_elements):
if self.left[i] {{cmp_left}} point {{cmp_right}} self.right[i]:
result.append(self.indices[i])
else:
# There are child nodes. Based on comparing our query to the pivot,
# look at the center values, then go to the relevant child.
if point < self.pivot:
values = self.center_left_values
indices = self.center_left_indices
for i in range(self.n_center):
if not values[i] {{cmp_left}} point:
break
result.append(indices[i])
if point {{cmp_right}} self.left_node.max_right:
self.left_node.query(result, point)
elif point > self.pivot:
values = self.center_right_values
indices = self.center_right_indices
for i in range(self.n_center - 1, -1, -1):
if not point {{cmp_right}} values[i]:
break
result.append(indices[i])
if self.right_node.min_left {{cmp_left}} point:
self.right_node.query(result, point)
else:
result.extend(self.center_left_indices)
NODE_CLASSES['{{dtype}}',
'{{closed}}'] = {{dtype_title}}Closed{{closed_title}}IntervalNode
{{endfor}}

View File

@@ -0,0 +1,93 @@
import numpy as np
from pandas._typing import npt
def inner_join(
left: np.ndarray, # const intp_t[:]
right: np.ndarray, # const intp_t[:]
max_groups: int,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def left_outer_join(
left: np.ndarray, # const intp_t[:]
right: np.ndarray, # const intp_t[:]
max_groups: int,
sort: bool = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def full_outer_join(
left: np.ndarray, # const intp_t[:]
right: np.ndarray, # const intp_t[:]
max_groups: int,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def ffill_indexer(
indexer: np.ndarray, # const intp_t[:]
) -> npt.NDArray[np.intp]: ...
def left_join_indexer_unique(
left: np.ndarray, # ndarray[join_t]
right: np.ndarray, # ndarray[join_t]
) -> npt.NDArray[np.intp]: ...
def left_join_indexer(
left: np.ndarray, # ndarray[join_t]
right: np.ndarray, # ndarray[join_t]
) -> tuple[
np.ndarray, # np.ndarray[join_t]
npt.NDArray[np.intp],
npt.NDArray[np.intp],
]: ...
def inner_join_indexer(
left: np.ndarray, # ndarray[join_t]
right: np.ndarray, # ndarray[join_t]
) -> tuple[
np.ndarray, # np.ndarray[join_t]
npt.NDArray[np.intp],
npt.NDArray[np.intp],
]: ...
def outer_join_indexer(
left: np.ndarray, # ndarray[join_t]
right: np.ndarray, # ndarray[join_t]
) -> tuple[
np.ndarray, # np.ndarray[join_t]
npt.NDArray[np.intp],
npt.NDArray[np.intp],
]: ...
def asof_join_backward_on_X_by_Y(
left_values: np.ndarray, # asof_t[:]
right_values: np.ndarray, # asof_t[:]
left_by_values: np.ndarray, # by_t[:]
right_by_values: np.ndarray, # by_t[:]
allow_exact_matches: bool = ...,
tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def asof_join_forward_on_X_by_Y(
left_values: np.ndarray, # asof_t[:]
right_values: np.ndarray, # asof_t[:]
left_by_values: np.ndarray, # by_t[:]
right_by_values: np.ndarray, # by_t[:]
allow_exact_matches: bool = ...,
tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def asof_join_nearest_on_X_by_Y(
left_values: np.ndarray, # asof_t[:]
right_values: np.ndarray, # asof_t[:]
left_by_values: np.ndarray, # by_t[:]
right_by_values: np.ndarray, # by_t[:]
allow_exact_matches: bool = ...,
tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def asof_join_backward(
left_values: np.ndarray, # asof_t[:]
right_values: np.ndarray, # asof_t[:]
allow_exact_matches: bool = ...,
tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def asof_join_forward(
left_values: np.ndarray, # asof_t[:]
right_values: np.ndarray, # asof_t[:]
allow_exact_matches: bool = ...,
tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def asof_join_nearest(
left_values: np.ndarray, # asof_t[:]
right_values: np.ndarray, # asof_t[:]
allow_exact_matches: bool = ...,
tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
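A minimal usage sketch for these signatures, assuming the compiled pandas._libs.join extension is importable and left/right hold group ids in [0, max_groups):

import numpy as np
from pandas._libs import join as libjoin

left = np.array([0, 1, 1, 2], dtype=np.intp)
right = np.array([0, 0, 2, 2], dtype=np.intp)
left_idx, right_idx = libjoin.inner_join(left, right, 3)  # max_groups=3
# left_idx and right_idx are aligned positional indexers into the original arrays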

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,129 @@
from cpython.object cimport PyObject
from numpy cimport (
complex64_t,
complex128_t,
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)
cdef extern from "khash_python.h":
const int KHASH_TRACE_DOMAIN
ctypedef uint32_t khuint_t
ctypedef khuint_t khiter_t
ctypedef struct khcomplex128_t:
double real
double imag
bint are_equivalent_khcomplex128_t \
"kh_complex_hash_equal" (khcomplex128_t a, khcomplex128_t b) nogil
ctypedef struct khcomplex64_t:
float real
float imag
bint are_equivalent_khcomplex64_t \
"kh_complex_hash_equal" (khcomplex64_t a, khcomplex64_t b) nogil
bint are_equivalent_float64_t \
"kh_floats_hash_equal" (float64_t a, float64_t b) nogil
bint are_equivalent_float32_t \
"kh_floats_hash_equal" (float32_t a, float32_t b) nogil
uint32_t kh_python_hash_func(object key)
bint kh_python_hash_equal(object a, object b)
ctypedef struct kh_pymap_t:
khuint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
PyObject **keys
size_t *vals
kh_pymap_t* kh_init_pymap()
void kh_destroy_pymap(kh_pymap_t*)
void kh_clear_pymap(kh_pymap_t*)
khuint_t kh_get_pymap(kh_pymap_t*, PyObject*)
void kh_resize_pymap(kh_pymap_t*, khuint_t)
khuint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*)
void kh_del_pymap(kh_pymap_t*, khuint_t)
bint kh_exist_pymap(kh_pymap_t*, khiter_t)
ctypedef struct kh_pyset_t:
khuint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
PyObject **keys
size_t *vals
kh_pyset_t* kh_init_pyset()
void kh_destroy_pyset(kh_pyset_t*)
void kh_clear_pyset(kh_pyset_t*)
khuint_t kh_get_pyset(kh_pyset_t*, PyObject*)
void kh_resize_pyset(kh_pyset_t*, khuint_t)
khuint_t kh_put_pyset(kh_pyset_t*, PyObject*, int*)
void kh_del_pyset(kh_pyset_t*, khuint_t)
bint kh_exist_pyset(kh_pyset_t*, khiter_t)
ctypedef char* kh_cstr_t
ctypedef struct kh_str_t:
khuint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
kh_cstr_t *keys
size_t *vals
kh_str_t* kh_init_str() nogil
void kh_destroy_str(kh_str_t*) nogil
void kh_clear_str(kh_str_t*) nogil
khuint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil
void kh_resize_str(kh_str_t*, khuint_t) nogil
khuint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil
void kh_del_str(kh_str_t*, khuint_t) nogil
bint kh_exist_str(kh_str_t*, khiter_t) nogil
ctypedef struct kh_str_starts_t:
kh_str_t *table
int starts[256]
kh_str_starts_t* kh_init_str_starts() nogil
khuint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key,
int* ret) nogil
khuint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) nogil
void kh_destroy_str_starts(kh_str_starts_t*) nogil
void kh_resize_str_starts(kh_str_starts_t*, khuint_t) nogil
# sweep factorize
ctypedef struct kh_strbox_t:
khuint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
kh_cstr_t *keys
PyObject **vals
kh_strbox_t* kh_init_strbox() nogil
void kh_destroy_strbox(kh_strbox_t*) nogil
void kh_clear_strbox(kh_strbox_t*) nogil
khuint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t) nogil
void kh_resize_strbox(kh_strbox_t*, khuint_t) nogil
khuint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*) nogil
void kh_del_strbox(kh_strbox_t*, khuint_t) nogil
bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
include "khash_for_primitive_helper.pxi"

View File

@@ -0,0 +1,44 @@
"""
Template for wrapping khash-tables for each primitive `dtype`
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
{{py:
# name, c_type
primitive_types = [('int64', 'int64_t'),
('uint64', 'uint64_t'),
('float64', 'float64_t'),
('int32', 'int32_t'),
('uint32', 'uint32_t'),
('float32', 'float32_t'),
('int16', 'int16_t'),
('uint16', 'uint16_t'),
('int8', 'int8_t'),
('uint8', 'uint8_t'),
('complex64', 'khcomplex64_t'),
('complex128', 'khcomplex128_t'),
]
}}
{{for name, c_type in primitive_types}}
cdef extern from "khash_python.h":
ctypedef struct kh_{{name}}_t:
khuint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
{{c_type}} *keys
size_t *vals
kh_{{name}}_t* kh_init_{{name}}() nogil
void kh_destroy_{{name}}(kh_{{name}}_t*) nogil
void kh_clear_{{name}}(kh_{{name}}_t*) nogil
khuint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil
void kh_resize_{{name}}(kh_{{name}}_t*, khuint_t) nogil
khuint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil
void kh_del_{{name}}(kh_{{name}}_t*, khuint_t) nogil
bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil
{{endfor}}

View File

@@ -0,0 +1,6 @@
from numpy cimport ndarray
cdef bint c_is_list_like(object, bint) except -1
cpdef ndarray eq_NA_compat(ndarray[object] arr, object key)

View File

@@ -0,0 +1,231 @@
# TODO(npdtypes): Many types specified here can be made more specific/accurate;
# the more specific versions are specified in comments
from typing import (
Any,
Callable,
Generator,
Hashable,
Literal,
overload,
)
import numpy as np
from pandas._typing import (
ArrayLike,
DtypeObj,
npt,
)
# placeholder until we can specify np.ndarray[object, ndim=2]
ndarray_obj_2d = np.ndarray
from enum import Enum
class NoDefault(Enum): ...
no_default: NoDefault
i8max: int
u8max: int
def item_from_zerodim(val: object) -> object: ...
def infer_dtype(value: object, skipna: bool = ...) -> str: ...
def is_iterator(obj: object) -> bool: ...
def is_scalar(val: object) -> bool: ...
def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ...
def is_period(val: object) -> bool: ...
def is_interval(val: object) -> bool: ...
def is_decimal(val: object) -> bool: ...
def is_complex(val: object) -> bool: ...
def is_bool(val: object) -> bool: ...
def is_integer(val: object) -> bool: ...
def is_float(val: object) -> bool: ...
def is_interval_array(values: np.ndarray) -> bool: ...
def is_datetime64_array(values: np.ndarray) -> bool: ...
def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ...
def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ...
def is_time_array(values: np.ndarray, skipna: bool = ...): ...
def is_date_array(values: np.ndarray, skipna: bool = ...): ...
def is_datetime_array(values: np.ndarray, skipna: bool = ...): ...
def is_string_array(values: np.ndarray, skipna: bool = ...): ...
def is_float_array(values: np.ndarray, skipna: bool = ...): ...
def is_integer_array(values: np.ndarray, skipna: bool = ...): ...
def is_bool_array(values: np.ndarray, skipna: bool = ...): ...
def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ...
def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ...
def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ...
def fast_unique_multiple(arrays: list, sort: bool = ...) -> list: ...
def map_infer(
arr: np.ndarray,
f: Callable[[Any], Any],
convert: bool = ...,
ignore_na: bool = ...,
) -> np.ndarray: ...
@overload # both convert_datetime and convert_to_nullable_integer False -> np.ndarray
def maybe_convert_objects(
objects: npt.NDArray[np.object_],
*,
try_float: bool = ...,
safe: bool = ...,
convert_datetime: Literal[False] = ...,
convert_timedelta: bool = ...,
convert_period: Literal[False] = ...,
convert_interval: Literal[False] = ...,
convert_to_nullable_integer: Literal[False] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> np.ndarray: ...
@overload
def maybe_convert_objects(
objects: npt.NDArray[np.object_],
*,
try_float: bool = ...,
safe: bool = ...,
convert_datetime: bool = ...,
convert_timedelta: bool = ...,
convert_period: bool = ...,
convert_interval: bool = ...,
convert_to_nullable_integer: Literal[True] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
@overload
def maybe_convert_objects(
objects: npt.NDArray[np.object_],
*,
try_float: bool = ...,
safe: bool = ...,
convert_datetime: Literal[True] = ...,
convert_timedelta: bool = ...,
convert_period: bool = ...,
convert_interval: bool = ...,
convert_to_nullable_integer: bool = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
@overload
def maybe_convert_objects(
objects: npt.NDArray[np.object_],
*,
try_float: bool = ...,
safe: bool = ...,
convert_datetime: bool = ...,
convert_timedelta: bool = ...,
convert_period: Literal[True] = ...,
convert_interval: bool = ...,
convert_to_nullable_integer: bool = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
@overload
def maybe_convert_objects(
objects: npt.NDArray[np.object_],
*,
try_float: bool = ...,
safe: bool = ...,
convert_datetime: bool = ...,
convert_timedelta: bool = ...,
convert_period: bool = ...,
convert_interval: bool = ...,
convert_to_nullable_integer: bool = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
@overload
def maybe_convert_numeric(
values: npt.NDArray[np.object_],
na_values: set,
convert_empty: bool = ...,
coerce_numeric: bool = ...,
convert_to_masked_nullable: Literal[False] = ...,
) -> tuple[np.ndarray, None]: ...
@overload
def maybe_convert_numeric(
values: npt.NDArray[np.object_],
na_values: set,
convert_empty: bool = ...,
coerce_numeric: bool = ...,
*,
convert_to_masked_nullable: Literal[True],
) -> tuple[np.ndarray, np.ndarray]: ...
# TODO: restrict `arr`?
def ensure_string_array(
arr,
na_value: object = ...,
convert_na_value: bool = ...,
copy: bool = ...,
skipna: bool = ...,
) -> npt.NDArray[np.object_]: ...
def infer_datetimelike_array(
arr: npt.NDArray[np.object_],
) -> tuple[str, bool]: ...
def astype_intsafe(
arr: npt.NDArray[np.object_],
new_dtype: np.dtype,
) -> np.ndarray: ...
def fast_zip(ndarrays: list) -> npt.NDArray[np.object_]: ...
# TODO: can we be more specific about rows?
def to_object_array_tuples(rows: object) -> ndarray_obj_2d: ...
def tuples_to_object_array(
tuples: npt.NDArray[np.object_],
) -> ndarray_obj_2d: ...
# TODO: can we be more specific about rows?
def to_object_array(rows: object, min_width: int = ...) -> ndarray_obj_2d: ...
def dicts_to_array(dicts: list, columns: list) -> ndarray_obj_2d: ...
def maybe_booleans_to_slice(
mask: npt.NDArray[np.uint8],
) -> slice | npt.NDArray[np.uint8]: ...
def maybe_indices_to_slice(
indices: npt.NDArray[np.intp],
max_len: int,
) -> slice | npt.NDArray[np.intp]: ...
def is_all_arraylike(obj: list) -> bool: ...
# -----------------------------------------------------------------
# Functions which in reality take memoryviews
def memory_usage_of_objects(arr: np.ndarray) -> int: ... # object[:] # np.int64
def map_infer_mask(
arr: np.ndarray,
f: Callable[[Any], Any],
mask: np.ndarray, # const uint8_t[:]
convert: bool = ...,
na_value: Any = ...,
dtype: np.dtype = ...,
) -> np.ndarray: ...
def indices_fast(
index: npt.NDArray[np.intp],
labels: np.ndarray, # const int64_t[:]
keys: list,
sorted_labels: list[npt.NDArray[np.int64]],
) -> dict[Hashable, npt.NDArray[np.intp]]: ...
def generate_slices(
labels: np.ndarray, ngroups: int # const intp_t[:]
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ...
def count_level_2d(
mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True],
labels: np.ndarray, # const intp_t[:]
max_bin: int,
axis: int,
) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2]
def get_level_sorter(
label: np.ndarray, # const int64_t[:]
starts: np.ndarray, # const intp_t[:]
) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1]
def generate_bins_dt64(
values: npt.NDArray[np.int64],
binner: np.ndarray, # const int64_t[:]
closed: object = ...,
hasnans: bool = ...,
) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1]
def array_equivalent_object(
left: np.ndarray, # object[:]
right: np.ndarray, # object[:]
) -> bool: ...
def has_infs(arr: np.ndarray) -> bool: ... # const floating[:]
def get_reverse_indexer(
indexer: np.ndarray, # const intp_t[:]
length: int,
) -> npt.NDArray[np.intp]: ...
def is_bool_list(obj: list) -> bool: ...
def dtypes_all_equal(types: list[DtypeObj]) -> bool: ...
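A few of these stubs exercised at runtime, assuming the compiled pandas._libs.lib module; returned values in comments are indicative:

import numpy as np
from pandas._libs import lib

lib.infer_dtype(np.array([1, 2, 3], dtype=object), skipna=True)       # 'integer'
lib.is_scalar(3.5)                                                    # True
lib.is_list_like((1, 2), allow_sets=True)                             # True
lib.maybe_convert_objects(np.array([1.0, 2.0, None], dtype=object))   # float64 array([1., 2., nan])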

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,19 @@
from numpy cimport (
ndarray,
uint8_t,
)
cpdef bint is_matching_na(object left, object right, bint nan_matches_none=*)
cpdef bint checknull(object val, bint inf_as_na=*)
cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=*)
cdef bint is_null_datetime64(v)
cdef bint is_null_timedelta64(v)
cdef bint checknull_with_nat_and_na(object obj)
cdef class C_NAType:
pass
cdef C_NAType C_NA

View File

@@ -0,0 +1,17 @@
import numpy as np
from numpy import typing as npt
class NAType: ...
NA: NAType
def is_matching_na(
left: object, right: object, nan_matches_none: bool = ...
) -> bool: ...
def isposinf_scalar(val: object) -> bool: ...
def isneginf_scalar(val: object) -> bool: ...
def checknull(val: object, inf_as_na: bool = ...) -> bool: ...
def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ...
def isnaobj2d(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ...
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_float_nan(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
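A quick runtime illustration of the predicates declared above, assuming the compiled pandas._libs.missing module:

import numpy as np
from pandas._libs import missing as libmissing

libmissing.checknull(np.nan)                                    # True
libmissing.checknull(np.inf)                                    # False; True only when inf_as_na=True
libmissing.is_matching_na(np.nan, None, nan_matches_none=True)  # True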

View File

@@ -0,0 +1,507 @@
from decimal import Decimal
import numbers
from sys import maxsize
import cython
from cython import Py_ssize_t
import numpy as np
cimport numpy as cnp
from numpy cimport (
float64_t,
int64_t,
ndarray,
uint8_t,
)
cnp.import_array()
from pandas._libs cimport util
from pandas._libs.tslibs.nattype cimport (
c_NaT as NaT,
checknull_with_nat,
is_dt64nat,
is_td64nat,
)
from pandas._libs.tslibs.np_datetime cimport (
get_datetime64_unit,
get_datetime64_value,
get_timedelta64_value,
)
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
cdef:
float64_t INF = <float64_t>np.inf
float64_t NEGINF = -INF
int64_t NPY_NAT = util.get_nat()
bint is_32bit = maxsize <= 2 ** 32
type cDecimal = Decimal # for faster isinstance checks
cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False):
"""
Check if two scalars are both NA of matching types.
Parameters
----------
left : Any
right : Any
nan_matches_none : bool, default False
For backwards compatibility, consider NaN as matching None.
Returns
-------
bool
"""
if left is None:
if nan_matches_none and util.is_nan(right):
return True
return right is None
elif left is C_NA:
return right is C_NA
elif left is NaT:
return right is NaT
elif util.is_float_object(left):
if nan_matches_none and right is None and util.is_nan(left):
return True
return (
util.is_nan(left)
and util.is_float_object(right)
and util.is_nan(right)
)
elif util.is_complex_object(left):
return (
util.is_nan(left)
and util.is_complex_object(right)
and util.is_nan(right)
)
elif util.is_datetime64_object(left):
return (
get_datetime64_value(left) == NPY_NAT
and util.is_datetime64_object(right)
and get_datetime64_value(right) == NPY_NAT
and get_datetime64_unit(left) == get_datetime64_unit(right)
)
elif util.is_timedelta64_object(left):
return (
get_timedelta64_value(left) == NPY_NAT
and util.is_timedelta64_object(right)
and get_timedelta64_value(right) == NPY_NAT
and get_datetime64_unit(left) == get_datetime64_unit(right)
)
elif is_decimal_na(left):
return is_decimal_na(right)
return False
cpdef bint checknull(object val, bint inf_as_na=False):
"""
Return a boolean describing whether the input is NA-like, defined here as any
of:
- None
- nan
- NaT
- np.datetime64 representation of NaT
- np.timedelta64 representation of NaT
- NA
- Decimal("NaN")
Parameters
----------
val : object
inf_as_na : bool, default False
Whether to treat INF and -INF as NA values.
Returns
-------
bool
"""
if val is None or val is NaT or val is C_NA:
return True
elif util.is_float_object(val) or util.is_complex_object(val):
if val != val:
return True
elif inf_as_na:
return val == INF or val == NEGINF
return False
elif util.is_timedelta64_object(val):
return get_timedelta64_value(val) == NPY_NAT
elif util.is_datetime64_object(val):
return get_datetime64_value(val) == NPY_NAT
else:
return is_decimal_na(val)
cdef inline bint is_decimal_na(object val):
"""
Check if this is a decimal.Decimal object with value Decimal("NaN").
"""
return isinstance(val, cDecimal) and val != val
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False):
"""
Return boolean mask denoting which elements of a 1-D array are na-like,
according to the criteria defined in `checknull`:
- None
- nan
- NaT
- np.datetime64 representation of NaT
- np.timedelta64 representation of NaT
- NA
- Decimal("NaN")
Parameters
----------
arr : ndarray
Returns
-------
result : ndarray (dtype=np.bool_)
"""
cdef:
Py_ssize_t i, n
object val
ndarray[uint8_t] result
assert arr.ndim == 1, "'arr' must be 1-D."
n = len(arr)
result = np.empty(n, dtype=np.uint8)
for i in range(n):
val = arr[i]
result[i] = checknull(val, inf_as_na=inf_as_na)
return result.view(np.bool_)
@cython.wraparound(False)
@cython.boundscheck(False)
def isnaobj2d(arr: ndarray, inf_as_na: bool = False) -> ndarray:
"""
Return boolean mask denoting which elements of a 2-D array are na-like,
according to the criteria defined in `checknull`:
- None
- nan
- NaT
- np.datetime64 representation of NaT
- np.timedelta64 representation of NaT
- NA
- Decimal("NaN")
Parameters
----------
arr : ndarray
Returns
-------
result : ndarray (dtype=np.bool_)
"""
cdef:
Py_ssize_t i, j, n, m
object val
ndarray[uint8_t, ndim=2] result
assert arr.ndim == 2, "'arr' must be 2-D."
n, m = (<object>arr).shape
result = np.zeros((n, m), dtype=np.uint8)
for i in range(n):
for j in range(m):
val = arr[i, j]
if checknull(val, inf_as_na=inf_as_na):
result[i, j] = 1
return result.view(np.bool_)
def isposinf_scalar(val: object) -> bool:
return util.is_float_object(val) and val == INF
def isneginf_scalar(val: object) -> bool:
return util.is_float_object(val) and val == NEGINF
cdef inline bint is_null_datetime64(v):
# determine if we have a null for a datetime (or integer versions),
# excluding np.timedelta64('nat')
if checknull_with_nat(v) or is_dt64nat(v):
return True
return False
cdef inline bint is_null_timedelta64(v):
# determine if we have a null for a timedelta (or integer versions),
# excluding np.datetime64('nat')
if checknull_with_nat(v) or is_td64nat(v):
return True
return False
cdef bint checknull_with_nat_and_na(object obj):
# See GH#32214
return checknull_with_nat(obj) or obj is C_NA
@cython.wraparound(False)
@cython.boundscheck(False)
def is_float_nan(values: ndarray) -> ndarray:
"""
True for elements which correspond to a float nan
Returns
-------
ndarray[bool]
"""
cdef:
ndarray[uint8_t] result
Py_ssize_t i, N
object val
N = len(values)
result = np.zeros(N, dtype=np.uint8)
for i in range(N):
val = values[i]
if util.is_nan(val):
result[i] = True
return result.view(bool)
@cython.wraparound(False)
@cython.boundscheck(False)
def is_numeric_na(values: ndarray) -> ndarray:
"""
Check for NA values consistent with IntegerArray/FloatingArray.
Similar to a vectorized is_valid_na_for_dtype restricted to numeric dtypes.
Returns
-------
ndarray[bool]
"""
cdef:
ndarray[uint8_t] result
Py_ssize_t i, N
object val
N = len(values)
result = np.zeros(N, dtype=np.uint8)
for i in range(N):
val = values[i]
if checknull(val):
if val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val):
result[i] = True
else:
raise TypeError(f"'values' contains non-numeric NA {val}")
return result.view(bool)
# -----------------------------------------------------------------------------
# Implementation of NA singleton
def _create_binary_propagating_op(name, is_divmod=False):
def method(self, other):
if (other is C_NA or isinstance(other, str)
or isinstance(other, (numbers.Number, np.bool_))
or isinstance(other, np.ndarray) and not other.shape):
# Need the other.shape clause to handle NumPy scalars,
# since we do a setitem on `out` below, which
# won't work for NumPy scalars.
if is_divmod:
return NA, NA
else:
return NA
elif isinstance(other, np.ndarray):
out = np.empty(other.shape, dtype=object)
out[:] = NA
if is_divmod:
return out, out.copy()
else:
return out
return NotImplemented
method.__name__ = name
return method
def _create_unary_propagating_op(name: str):
def method(self):
return NA
method.__name__ = name
return method
cdef class C_NAType:
pass
class NAType(C_NAType):
"""
NA ("not available") missing value indicator.
.. warning::
Experimental: the behaviour of NA can still change without warning.
.. versionadded:: 1.0.0
The NA singleton is a missing value indicator defined by pandas. It is
used in certain new extension dtypes (currently the "string" dtype).
"""
_instance = None
def __new__(cls, *args, **kwargs):
if NAType._instance is None:
NAType._instance = C_NAType.__new__(cls, *args, **kwargs)
return NAType._instance
def __repr__(self) -> str:
return "<NA>"
def __format__(self, format_spec) -> str:
try:
return self.__repr__().__format__(format_spec)
except ValueError:
return self.__repr__()
def __bool__(self):
raise TypeError("boolean value of NA is ambiguous")
def __hash__(self):
# GH 30013: Ensure hash is large enough to avoid hash collisions with integers
exponent = 31 if is_32bit else 61
return 2 ** exponent - 1
def __reduce__(self):
return "NA"
# Binary arithmetic and comparison ops -> propagate
__add__ = _create_binary_propagating_op("__add__")
__radd__ = _create_binary_propagating_op("__radd__")
__sub__ = _create_binary_propagating_op("__sub__")
__rsub__ = _create_binary_propagating_op("__rsub__")
__mul__ = _create_binary_propagating_op("__mul__")
__rmul__ = _create_binary_propagating_op("__rmul__")
__matmul__ = _create_binary_propagating_op("__matmul__")
__rmatmul__ = _create_binary_propagating_op("__rmatmul__")
__truediv__ = _create_binary_propagating_op("__truediv__")
__rtruediv__ = _create_binary_propagating_op("__rtruediv__")
__floordiv__ = _create_binary_propagating_op("__floordiv__")
__rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
__mod__ = _create_binary_propagating_op("__mod__")
__rmod__ = _create_binary_propagating_op("__rmod__")
__divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True)
__rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True)
# __lshift__ and __rshift__ are not implemented
__eq__ = _create_binary_propagating_op("__eq__")
__ne__ = _create_binary_propagating_op("__ne__")
__le__ = _create_binary_propagating_op("__le__")
__lt__ = _create_binary_propagating_op("__lt__")
__gt__ = _create_binary_propagating_op("__gt__")
__ge__ = _create_binary_propagating_op("__ge__")
# Unary ops
__neg__ = _create_unary_propagating_op("__neg__")
__pos__ = _create_unary_propagating_op("__pos__")
__abs__ = _create_unary_propagating_op("__abs__")
__invert__ = _create_unary_propagating_op("__invert__")
# pow has special
def __pow__(self, other):
if other is C_NA:
return NA
elif isinstance(other, (numbers.Number, np.bool_)):
if other == 0:
# returning positive is correct for +/- 0.
return type(other)(1)
else:
return NA
elif isinstance(other, np.ndarray):
return np.where(other == 0, other.dtype.type(1), NA)
return NotImplemented
def __rpow__(self, other):
if other is C_NA:
return NA
elif isinstance(other, (numbers.Number, np.bool_)):
if other == 1:
return other
else:
return NA
elif isinstance(other, np.ndarray):
return np.where(other == 1, other, NA)
return NotImplemented
# Logical ops using Kleene logic
def __and__(self, other):
if other is False:
return False
elif other is True or other is C_NA:
return NA
return NotImplemented
__rand__ = __and__
def __or__(self, other):
if other is True:
return True
elif other is False or other is C_NA:
return NA
return NotImplemented
__ror__ = __or__
def __xor__(self, other):
if other is False or other is True or other is C_NA:
return NA
return NotImplemented
__rxor__ = __xor__
__array_priority__ = 1000
_HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_)
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
types = self._HANDLED_TYPES + (NAType,)
for x in inputs:
if not isinstance(x, types):
return NotImplemented
if method != "__call__":
raise ValueError(f"ufunc method '{method}' not supported for NA")
result = maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs
)
if result is NotImplemented:
# For a NumPy ufunc that's not a binop, like np.logaddexp
index = [i for i, x in enumerate(inputs) if x is NA][0]
result = np.broadcast_arrays(*inputs)[index]
if result.ndim == 0:
result = result.item()
if ufunc.nout > 1:
result = (NA,) * ufunc.nout
return result
C_NA = NAType() # C-visible
NA = C_NA # Python-visible
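The propagation and Kleene-logic rules implemented above, seen from Python (pd.NA is the singleton created here):

import pandas as pd

pd.NA + 1      # <NA>: arithmetic and comparison ops propagate
pd.NA ** 0     # 1: special-cased in __pow__
1 ** pd.NA     # 1: special-cased in __rpow__
pd.NA | True   # True, and pd.NA & False is False (Kleene logic)
bool(pd.NA)    # raises TypeError: boolean value of NA is ambiguous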

View File

@@ -0,0 +1,50 @@
from typing import (
Any,
Callable,
Iterable,
Literal,
overload,
)
import numpy as np
from pandas._typing import npt
_BinOp = Callable[[Any, Any], Any]
_BoolOp = Callable[[Any, Any], bool]
def scalar_compare(
values: np.ndarray, # object[:]
val: object,
op: _BoolOp, # {operator.eq, operator.ne, ...}
) -> npt.NDArray[np.bool_]: ...
def vec_compare(
left: npt.NDArray[np.object_],
right: npt.NDArray[np.object_],
op: _BoolOp, # {operator.eq, operator.ne, ...}
) -> npt.NDArray[np.bool_]: ...
def scalar_binop(
values: np.ndarray, # object[:]
val: object,
op: _BinOp, # binary operator
) -> np.ndarray: ...
def vec_binop(
left: np.ndarray, # object[:]
right: np.ndarray, # object[:]
op: _BinOp, # binary operator
) -> np.ndarray: ...
@overload
def maybe_convert_bool(
arr: npt.NDArray[np.object_],
true_values: Iterable = ...,
false_values: Iterable = ...,
convert_to_masked_nullable: Literal[False] = ...,
) -> tuple[np.ndarray, None]: ...
@overload
def maybe_convert_bool(
arr: npt.NDArray[np.object_],
true_values: Iterable = ...,
false_values: Iterable = ...,
*,
convert_to_masked_nullable: Literal[True],
) -> tuple[np.ndarray, np.ndarray]: ...

View File

@@ -0,0 +1,310 @@
import operator
from cpython.object cimport (
Py_EQ,
Py_GE,
Py_GT,
Py_LE,
Py_LT,
Py_NE,
PyObject_RichCompareBool,
)
import cython
from cython import Py_ssize_t
import numpy as np
from numpy cimport (
import_array,
ndarray,
uint8_t,
)
import_array()
from pandas._libs.missing cimport checknull
from pandas._libs.util cimport is_nan
@cython.wraparound(False)
@cython.boundscheck(False)
def scalar_compare(object[:] values, object val, object op) -> ndarray:
"""
Compare each element of `values` array with the scalar `val`, with
the comparison operation described by `op`.
Parameters
----------
values : ndarray[object]
val : object
op : {operator.eq, operator.ne,
operator.le, operator.lt,
operator.ge, operator.gt}
Returns
-------
result : ndarray[bool]
"""
cdef:
Py_ssize_t i, n = len(values)
ndarray[uint8_t, cast=True] result
bint isnull_val
int flag
object x
if op is operator.lt:
flag = Py_LT
elif op is operator.le:
flag = Py_LE
elif op is operator.gt:
flag = Py_GT
elif op is operator.ge:
flag = Py_GE
elif op is operator.eq:
flag = Py_EQ
elif op is operator.ne:
flag = Py_NE
else:
raise ValueError('Unrecognized operator')
result = np.empty(n, dtype=bool).view(np.uint8)
isnull_val = checknull(val)
if flag == Py_NE:
for i in range(n):
x = values[i]
if checknull(x):
result[i] = True
elif isnull_val:
result[i] = True
else:
try:
result[i] = PyObject_RichCompareBool(x, val, flag)
except TypeError:
result[i] = True
elif flag == Py_EQ:
for i in range(n):
x = values[i]
if checknull(x):
result[i] = False
elif isnull_val:
result[i] = False
else:
try:
result[i] = PyObject_RichCompareBool(x, val, flag)
except TypeError:
result[i] = False
else:
for i in range(n):
x = values[i]
if checknull(x):
result[i] = False
elif isnull_val:
result[i] = False
else:
result[i] = PyObject_RichCompareBool(x, val, flag)
return result.view(bool)
@cython.wraparound(False)
@cython.boundscheck(False)
def vec_compare(ndarray[object] left, ndarray[object] right, object op) -> ndarray:
"""
Compare the elements of `left` with the elements of `right` pointwise,
with the comparison operation described by `op`.
Parameters
----------
left : ndarray[object]
right : ndarray[object]
op : {operator.eq, operator.ne,
operator.le, operator.lt,
operator.ge, operator.gt}
Returns
-------
result : ndarray[bool]
"""
cdef:
Py_ssize_t i, n = len(left)
ndarray[uint8_t, cast=True] result
int flag
if n != <Py_ssize_t>len(right):
raise ValueError(f'Arrays were different lengths: {n} vs {len(right)}')
if op is operator.lt:
flag = Py_LT
elif op is operator.le:
flag = Py_LE
elif op is operator.gt:
flag = Py_GT
elif op is operator.ge:
flag = Py_GE
elif op is operator.eq:
flag = Py_EQ
elif op is operator.ne:
flag = Py_NE
else:
raise ValueError('Unrecognized operator')
result = np.empty(n, dtype=bool).view(np.uint8)
if flag == Py_NE:
for i in range(n):
x = left[i]
y = right[i]
if checknull(x) or checknull(y):
result[i] = True
else:
result[i] = PyObject_RichCompareBool(x, y, flag)
else:
for i in range(n):
x = left[i]
y = right[i]
if checknull(x) or checknull(y):
result[i] = False
else:
result[i] = PyObject_RichCompareBool(x, y, flag)
return result.view(bool)
@cython.wraparound(False)
@cython.boundscheck(False)
def scalar_binop(object[:] values, object val, object op) -> ndarray:
"""
Apply the given binary operator `op` between each element of the array
`values` and the scalar `val`.
Parameters
----------
values : ndarray[object]
val : object
op : binary operator
Returns
-------
result : ndarray[object]
"""
cdef:
Py_ssize_t i, n = len(values)
object[:] result
object x
result = np.empty(n, dtype=object)
if val is None or is_nan(val):
result[:] = val
return result.base # `.base` to access underlying np.ndarray
for i in range(n):
x = values[i]
if x is None or is_nan(x):
result[i] = x
else:
result[i] = op(x, val)
return maybe_convert_bool(result.base)[0]
@cython.wraparound(False)
@cython.boundscheck(False)
def vec_binop(object[:] left, object[:] right, object op) -> ndarray:
"""
Apply the given binary operator `op` pointwise to the elements of
arrays `left` and `right`.
Parameters
----------
left : ndarray[object]
right : ndarray[object]
op : binary operator
Returns
-------
result : ndarray[object]
"""
cdef:
Py_ssize_t i, n = len(left)
object[:] result
if n != <Py_ssize_t>len(right):
raise ValueError(f'Arrays were different lengths: {n} vs {len(right)}')
result = np.empty(n, dtype=object)
for i in range(n):
x = left[i]
y = right[i]
try:
result[i] = op(x, y)
except TypeError:
if x is None or is_nan(x):
result[i] = x
elif y is None or is_nan(y):
result[i] = y
else:
raise
return maybe_convert_bool(result.base)[0] # `.base` to access np.ndarray
def maybe_convert_bool(ndarray[object] arr,
true_values=None,
false_values=None,
convert_to_masked_nullable=False
) -> tuple[np.ndarray, np.ndarray | None]:
cdef:
Py_ssize_t i, n
ndarray[uint8_t] result
ndarray[uint8_t] mask
object val
set true_vals, false_vals
bint has_na = False
n = len(arr)
result = np.empty(n, dtype=np.uint8)
mask = np.zeros(n, dtype=np.uint8)
# the defaults
true_vals = {'True', 'TRUE', 'true'}
false_vals = {'False', 'FALSE', 'false'}
if true_values is not None:
true_vals = true_vals | set(true_values)
if false_values is not None:
false_vals = false_vals | set(false_values)
for i in range(n):
val = arr[i]
if isinstance(val, bool):
if val is True:
result[i] = 1
else:
result[i] = 0
elif val in true_vals:
result[i] = 1
elif val in false_vals:
result[i] = 0
elif is_nan(val):
mask[i] = 1
result[i] = 0 # Value here doesn't matter, will be replaced w/ nan
has_na = True
else:
return (arr, None)
if has_na:
if convert_to_masked_nullable:
return (result.view(np.bool_), mask.view(np.bool_))
else:
arr = result.view(np.bool_).astype(object)
np.putmask(arr, mask, np.nan)
return (arr, None)
else:
return (result.view(np.bool_), None)
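A sketch of maybe_convert_bool in its two return modes, assuming the compiled module:

import numpy as np
from pandas._libs import ops as libops

arr = np.array(["True", "false", np.nan], dtype=object)
libops.maybe_convert_bool(arr)                                    # (object array [True, False, nan], None)
libops.maybe_convert_bool(arr, convert_to_masked_nullable=True)   # (bool array, bool mask flagging the nan)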

View File

@@ -0,0 +1,5 @@
import numpy as np
def maybe_dispatch_ufunc_to_dunder_op(
self, ufunc: np.ufunc, method: str, *inputs, **kwargs
): ...

View File

@@ -0,0 +1,121 @@
DISPATCHED_UFUNCS = {
"add",
"sub",
"mul",
"pow",
"mod",
"floordiv",
"truediv",
"divmod",
"eq",
"ne",
"lt",
"gt",
"le",
"ge",
"remainder",
"matmul",
"or",
"xor",
"and",
"neg",
"pos",
"abs",
}
UNARY_UFUNCS = {
"neg",
"pos",
"abs",
}
UFUNC_ALIASES = {
"subtract": "sub",
"multiply": "mul",
"floor_divide": "floordiv",
"true_divide": "truediv",
"power": "pow",
"remainder": "mod",
"divide": "truediv",
"equal": "eq",
"not_equal": "ne",
"less": "lt",
"less_equal": "le",
"greater": "gt",
"greater_equal": "ge",
"bitwise_or": "or",
"bitwise_and": "and",
"bitwise_xor": "xor",
"negative": "neg",
"absolute": "abs",
"positive": "pos",
}
# For op(., Array) -> Array.__r{op}__
REVERSED_NAMES = {
"lt": "__gt__",
"le": "__ge__",
"gt": "__lt__",
"ge": "__le__",
"eq": "__eq__",
"ne": "__ne__",
}
def maybe_dispatch_ufunc_to_dunder_op(
object self, object ufunc, str method, *inputs, **kwargs
):
"""
Dispatch a ufunc to the equivalent dunder method.
Parameters
----------
self : ArrayLike
The array whose dunder method we dispatch to
ufunc : Callable
A NumPy ufunc
method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'}
inputs : ArrayLike
The input arrays.
kwargs : Any
The additional keyword arguments, e.g. ``out``.
Returns
-------
result : Any
The result of applying the ufunc
"""
# DISPATCHED_UFUNCS has the ufuncs we dispatch to the dunder op on
op_name = ufunc.__name__
op_name = UFUNC_ALIASES.get(op_name, op_name)
def not_implemented(*args, **kwargs):
return NotImplemented
if kwargs or ufunc.nin > 2:
return NotImplemented
if method == "__call__" and op_name in DISPATCHED_UFUNCS:
if inputs[0] is self:
name = f"__{op_name}__"
meth = getattr(self, name, not_implemented)
if op_name in UNARY_UFUNCS:
assert len(inputs) == 1
return meth()
return meth(inputs[1])
elif inputs[1] is self:
name = REVERSED_NAMES.get(op_name, f"__r{op_name}__")
meth = getattr(self, name, not_implemented)
result = meth(inputs[0])
return result
else:
# should not be reached, but covering our bases
return NotImplemented
else:
return NotImplemented
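In practice this dispatcher is called from __array_ufunc__ implementations (the NAType above and the masked extension arrays), so NumPy ufuncs are routed back into the dunder ops; an indicative example:

import numpy as np
import pandas as pd

arr = pd.array([1, 2, pd.NA], dtype="Int64")
np.add(arr, 1)     # dispatched to arr.__add__(1) via the "add" -> __add__ mapping
np.negative(arr)   # unary ufunc -> arr.__neg__()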

View File

@@ -0,0 +1,71 @@
from typing import (
Hashable,
Literal,
)
import numpy as np
from pandas._typing import (
ArrayLike,
Dtype,
npt,
)
STR_NA_VALUES: set[str]
def sanitize_objects(
values: npt.NDArray[np.object_],
na_values: set,
) -> int: ...
class TextReader:
unnamed_cols: set[str]
table_width: int # int64_t
leading_cols: int # int64_t
header: list[list[int]] # non-negative integers
def __init__(
self,
source,
delimiter: bytes | str = ..., # single-character only
header=...,
header_start: int = ..., # int64_t
header_end: int = ..., # uint64_t
index_col=...,
names=...,
tokenize_chunksize: int = ..., # int64_t
delim_whitespace: bool = ...,
converters=...,
skipinitialspace: bool = ...,
escapechar: bytes | str | None = ..., # single-character only
doublequote: bool = ...,
quotechar: str | bytes | None = ..., # at most 1 character
quoting: int = ...,
lineterminator: bytes | str | None = ..., # at most 1 character
comment=...,
decimal: bytes | str = ..., # single-character only
thousands: bytes | str | None = ..., # single-character only
dtype: Dtype | dict[Hashable, Dtype] = ...,
usecols=...,
error_bad_lines: bool = ...,
warn_bad_lines: bool = ...,
na_filter: bool = ...,
na_values=...,
na_fvalues=...,
keep_default_na: bool = ...,
true_values=...,
false_values=...,
allow_leading_cols: bool = ...,
skiprows=...,
skipfooter: int = ..., # int64_t
verbose: bool = ...,
mangle_dupe_cols: bool = ...,
float_precision: Literal["round_trip", "legacy", "high"] | None = ...,
skip_blank_lines: bool = ...,
encoding_errors: bytes | str = ...,
): ...
def set_error_bad_lines(self, status: int) -> None: ...
def set_noconvert(self, i: int) -> None: ...
def remove_noconvert(self, i: int) -> None: ...
def close(self) -> None: ...
def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ...
def read_low_memory(self, rows: int | None) -> list[dict[int, ArrayLike]]: ...

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,9 @@
# pyright: reportIncompleteStub = false
from typing import Any
# note: this is a lie to make type checkers happy (they special
# case property). cache_readonly uses attribute names similar to
# property (fget) but it does not provide fset and fdel.
cache_readonly = property
def __getattr__(name: str) -> Any: ... # incomplete

View File

@@ -0,0 +1,70 @@
from cython import Py_ssize_t
from cpython.dict cimport (
PyDict_Contains,
PyDict_GetItem,
PyDict_SetItem,
)
cdef class CachedProperty:
cdef readonly:
object fget, name, __doc__
def __init__(self, fget):
self.fget = fget
self.name = fget.__name__
self.__doc__ = getattr(fget, '__doc__', None)
def __get__(self, obj, typ):
if obj is None:
# accessed on the class, not the instance
return self
# Get the cache or set a default one if needed
cache = getattr(obj, '_cache', None)
if cache is None:
try:
cache = obj._cache = {}
except AttributeError:
return self
if PyDict_Contains(cache, self.name):
# not necessary to Py_INCREF
val = <object>PyDict_GetItem(cache, self.name)
else:
val = self.fget(obj)
PyDict_SetItem(cache, self.name, val)
return val
def __set__(self, obj, value):
raise AttributeError("Can't set attribute")
cache_readonly = CachedProperty
cdef class AxisProperty:
cdef readonly:
Py_ssize_t axis
object __doc__
def __init__(self, axis=0, doc=""):
self.axis = axis
self.__doc__ = doc
def __get__(self, obj, type):
cdef:
list axes
if obj is None:
# Only instances have _mgr, not classes
return self
else:
axes = obj._mgr.axes
return axes[self.axis]
def __set__(self, obj, value):
obj._set_axis(self.axis, value)
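cache_readonly behaves as a read-only, per-instance cached property; a small sketch with an illustrative class:

from pandas._libs.properties import cache_readonly

class Box:
    def __init__(self, x):
        self.x = x

    @cache_readonly
    def doubled(self):
        return self.x * 2   # computed once, then served from self._cache["doubled"]

b = Box(3)
b.doubled        # 6, computed and cached
b.doubled = 10   # raises AttributeError: Can't set attribute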

View File

@@ -0,0 +1,33 @@
import numpy as np
cimport numpy as cnp
cnp.import_array()
from pandas._libs.util cimport is_array
cdef cnp.dtype _dtype_obj = np.dtype("object")
cpdef check_result_array(object obj, object dtype):
# Our operation is supposed to be an aggregation/reduction. If
# it returns an ndarray, this likely means an invalid operation has
# been passed. See test_apply_without_aggregation, test_agg_must_agg
if is_array(obj):
if dtype != _dtype_obj:
# If it is object dtype, the function can be a reduction/aggregation
# and still return an ndarray e.g. test_agg_over_numpy_arrays
raise ValueError("Must produce aggregated value")
cpdef inline extract_result(object res):
""" extract the result object, it might be a 0-dim ndarray
or a len-1 0-dim, or a scalar """
if hasattr(res, "_values"):
# Preserve EA
res = res._values
if res.ndim == 1 and len(res) == 1:
# see test_agg_lambda_with_timezone, test_resampler_grouper.py::test_apply
res = res[0]
return res

View File

@@ -0,0 +1,16 @@
import numpy as np
from pandas._typing import npt
def unstack(
values: np.ndarray, # reshape_t[:, :]
mask: np.ndarray, # const uint8_t[:]
stride: int,
length: int,
width: int,
new_values: np.ndarray, # reshape_t[:, :]
new_mask: np.ndarray, # uint8_t[:, :]
) -> None: ...
def explode(
values: npt.NDArray[np.object_],
) -> tuple[npt.NDArray[np.object_], npt.NDArray[np.int64]]: ...

View File

@@ -0,0 +1,139 @@
import cython
from cython import Py_ssize_t
from numpy cimport (
int64_t,
ndarray,
uint8_t,
)
import numpy as np
cimport numpy as cnp
cnp.import_array()
from pandas._libs.dtypes cimport numeric_object_t
from pandas._libs.lib cimport c_is_list_like
@cython.wraparound(False)
@cython.boundscheck(False)
def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask,
Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width,
numeric_object_t[:, :] new_values, uint8_t[:, :] new_mask) -> None:
"""
Transform long values to wide new_values.
Parameters
----------
values : typed ndarray
mask : np.ndarray[bool]
stride : int
length : int
width : int
new_values : np.ndarray[bool]
result array
new_mask : np.ndarray[bool]
result mask
"""
cdef:
Py_ssize_t i, j, w, nulls, s, offset
if numeric_object_t is not object:
# evaluated at compile-time
with nogil:
for i in range(stride):
nulls = 0
for j in range(length):
for w in range(width):
offset = j * width + w
if mask[offset]:
s = i * width + w
new_values[j, s] = values[offset - nulls, i]
new_mask[j, s] = 1
else:
nulls += 1
else:
# object-dtype, identical to above but we cannot use nogil
for i in range(stride):
nulls = 0
for j in range(length):
for w in range(width):
offset = j * width + w
if mask[offset]:
s = i * width + w
new_values[j, s] = values[offset - nulls, i]
new_mask[j, s] = 1
else:
nulls += 1
@cython.wraparound(False)
@cython.boundscheck(False)
def explode(ndarray[object] values):
"""
Transform array list-likes to long form, preserving non-list entries.
Parameters
----------
values : ndarray[object]
Returns
-------
ndarray[object]
result
ndarray[int64_t]
counts
"""
cdef:
Py_ssize_t i, j, count, n
object v
ndarray[object] result
ndarray[int64_t] counts
# find the resulting len
n = len(values)
counts = np.zeros(n, dtype='int64')
for i in range(n):
v = values[i]
if c_is_list_like(v, True):
if len(v):
counts[i] += len(v)
else:
# empty list-like, use a nan marker
counts[i] += 1
else:
counts[i] += 1
result = np.empty(counts.sum(), dtype='object')
count = 0
for i in range(n):
v = values[i]
if c_is_list_like(v, True):
if len(v):
v = list(v)
for j in range(len(v)):
result[count] = v[j]
count += 1
else:
# empty list-like, use a nan marker
result[count] = np.nan
count += 1
else:
# replace with the existing scalar
result[count] = v
count += 1
return result, counts
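explode flattens list-likes, keeps scalars, and emits a NaN marker for empty list-likes; for instance, assuming the compiled module:

import numpy as np
from pandas._libs import reshape as libreshape

values = np.empty(3, dtype=object)
values[0], values[1], values[2] = [1, 2], "a", []
result, counts = libreshape.explode(values)
# result -> array([1, 2, 'a', nan], dtype=object), counts -> array([2, 1, 1])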

View File

@@ -0,0 +1,47 @@
from typing import (
Sequence,
TypeVar,
)
import numpy as np
from pandas._typing import npt
SparseIndexT = TypeVar("SparseIndexT", bound="SparseIndex")
class SparseIndex:
length: int
npoints: int
def __init__(self): ...
@property
def ngaps(self) -> int: ...
@property
def nbytes(self) -> int: ...
@property
def indices(self) -> npt.NDArray[np.int32]: ...
def equals(self, other) -> bool: ...
def lookup(self, index: int) -> np.int32: ...
def lookup_array(self, indexer: npt.NDArray[np.int32]) -> npt.NDArray[np.int32]: ...
def to_int_index(self) -> IntIndex: ...
def to_block_index(self) -> BlockIndex: ...
def intersect(self: SparseIndexT, y_: SparseIndex) -> SparseIndexT: ...
def make_union(self: SparseIndexT, y_: SparseIndex) -> SparseIndexT: ...
class IntIndex(SparseIndex):
indices: npt.NDArray[np.int32]
def __init__(
self, length: int, indices: Sequence[int], check_integrity: bool = ...
): ...
class BlockIndex(SparseIndex):
nblocks: int
blocs: np.ndarray
blengths: np.ndarray
def __init__(self, length: int, blocs: np.ndarray, blengths: np.ndarray): ...
def make_mask_object_ndarray(
arr: npt.NDArray[np.object_], fill_value
) -> npt.NDArray[np.bool_]: ...
def get_blocks(
indices: npt.NDArray[np.int32],
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.int32]]: ...

View File

@@ -0,0 +1,738 @@
import cython
import numpy as np
cimport numpy as cnp
from numpy cimport (
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
ndarray,
uint8_t,
)
cnp.import_array()
# -----------------------------------------------------------------------------
# Preamble stuff
cdef float64_t NaN = <float64_t>np.NaN
cdef float64_t INF = <float64_t>np.inf
# -----------------------------------------------------------------------------
cdef class SparseIndex:
"""
Abstract superclass for sparse index types.
"""
def __init__(self):
raise NotImplementedError
cdef class IntIndex(SparseIndex):
"""
Object for holding exact integer sparse indexing information
Parameters
----------
length : integer
indices : array-like
Contains integers corresponding to the indices.
check_integrity : bool, default=True
Check integrity of the input.
"""
cdef readonly:
Py_ssize_t length, npoints
ndarray indices
def __init__(self, Py_ssize_t length, indices, bint check_integrity=True):
self.length = length
self.indices = np.ascontiguousarray(indices, dtype=np.int32)
self.npoints = len(self.indices)
if check_integrity:
self.check_integrity()
def __reduce__(self):
args = (self.length, self.indices)
return IntIndex, args
def __repr__(self) -> str:
output = 'IntIndex\n'
output += f'Indices: {repr(self.indices)}\n'
return output
@property
def nbytes(self) -> int:
return self.indices.nbytes
cdef check_integrity(self):
"""
Checks the following:
- Indices are strictly ascending
- Number of indices is at most self.length
- Indices are at least 0 and at most the total length less one
A ValueError is raised if any of these conditions is violated.
"""
if self.npoints > self.length:
raise ValueError(
f"Too many indices. Expected {self.length} but found {self.npoints}"
)
# Indices are vacuously ordered and non-negative
# if the sequence of indices is empty.
if self.npoints == 0:
return
if self.indices.min() < 0:
raise ValueError("No index can be less than zero")
if self.indices.max() >= self.length:
raise ValueError("All indices must be less than the length")
monotonic = np.all(self.indices[:-1] < self.indices[1:])
if not monotonic:
raise ValueError("Indices must be strictly increasing")
def equals(self, other: object) -> bool:
if not isinstance(other, IntIndex):
return False
if self is other:
return True
same_length = self.length == other.length
same_indices = np.array_equal(self.indices, other.indices)
return same_length and same_indices
@property
def ngaps(self) -> int:
return self.length - self.npoints
cpdef to_int_index(self):
return self
def to_block_index(self):
locs, lens = get_blocks(self.indices)
return BlockIndex(self.length, locs, lens)
cpdef IntIndex intersect(self, SparseIndex y_):
cdef:
Py_ssize_t out_length, xi, yi = 0, result_indexer = 0
int32_t xind
ndarray[int32_t, ndim=1] xindices, yindices, new_indices
IntIndex y
# if is one already, returns self
y = y_.to_int_index()
if self.length != y.length:
raise Exception('Indices must reference same underlying length')
xindices = self.indices
yindices = y.indices
new_indices = np.empty(min(
len(xindices), len(yindices)), dtype=np.int32)
for xi in range(self.npoints):
xind = xindices[xi]
while yi < y.npoints and yindices[yi] < xind:
yi += 1
if yi >= y.npoints:
break
# TODO: would a two-pass algorithm be faster?
if yindices[yi] == xind:
new_indices[result_indexer] = xind
result_indexer += 1
new_indices = new_indices[:result_indexer]
return IntIndex(self.length, new_indices)
cpdef IntIndex make_union(self, SparseIndex y_):
cdef:
ndarray[int32_t, ndim=1] new_indices
IntIndex y
# if is one already, returns self
y = y_.to_int_index()
if self.length != y.length:
raise ValueError('Indices must reference same underlying length')
new_indices = np.union1d(self.indices, y.indices)
return IntIndex(self.length, new_indices)
@cython.wraparound(False)
cpdef int32_t lookup(self, Py_ssize_t index):
"""
Return the internal location if value exists on given index.
Return -1 otherwise.
"""
cdef:
int32_t res
ndarray[int32_t, ndim=1] inds
inds = self.indices
if self.npoints == 0:
return -1
elif index < 0 or self.length <= index:
return -1
res = inds.searchsorted(index)
if res == self.npoints:
return -1
elif inds[res] == index:
return res
else:
return -1
@cython.wraparound(False)
cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer):
"""
Vectorized lookup, returns ndarray[int32_t]
"""
cdef:
Py_ssize_t n, i, ind_val
ndarray[int32_t, ndim=1] inds
ndarray[uint8_t, ndim=1, cast=True] mask
ndarray[int32_t, ndim=1] masked
ndarray[int32_t, ndim=1] res
ndarray[int32_t, ndim=1] results
n = len(indexer)
results = np.empty(n, dtype=np.int32)
results[:] = -1
if self.npoints == 0:
return results
inds = self.indices
mask = (inds[0] <= indexer) & (indexer <= inds[len(inds) - 1])
masked = indexer[mask]
res = inds.searchsorted(masked).astype(np.int32)
res[inds[res] != masked] = -1
results[mask] = res
return results
cpdef get_blocks(ndarray[int32_t, ndim=1] indices):
cdef:
Py_ssize_t init_len, i, npoints, result_indexer = 0
int32_t block, length = 1, cur, prev
ndarray[int32_t, ndim=1] locs, lens
npoints = len(indices)
# just handle the special empty case separately
if npoints == 0:
return np.array([], dtype=np.int32), np.array([], dtype=np.int32)
# block size can't be longer than npoints
locs = np.empty(npoints, dtype=np.int32)
lens = np.empty(npoints, dtype=np.int32)
# TODO: two-pass algorithm faster?
prev = block = indices[0]
for i in range(1, npoints):
cur = indices[i]
if cur - prev > 1:
# new block
locs[result_indexer] = block
lens[result_indexer] = length
block = cur
length = 1
result_indexer += 1
else:
# same block, increment length
length += 1
prev = cur
locs[result_indexer] = block
lens[result_indexer] = length
result_indexer += 1
locs = locs[:result_indexer]
lens = lens[:result_indexer]
return locs, lens
# -----------------------------------------------------------------------------
# BlockIndex
cdef class BlockIndex(SparseIndex):
"""
Object for holding block-based sparse indexing information
Parameters
----------
length : int
blocs : ndarray
Starting positions of the blocks.
blengths : ndarray
Lengths of the blocks.
"""
cdef readonly:
int32_t nblocks, npoints, length
ndarray blocs, blengths
cdef:
object __weakref__ # need to be picklable
int32_t *locbuf
int32_t *lenbuf
def __init__(self, length, blocs, blengths):
self.blocs = np.ascontiguousarray(blocs, dtype=np.int32)
self.blengths = np.ascontiguousarray(blengths, dtype=np.int32)
# in case we need
self.locbuf = <int32_t*>self.blocs.data
self.lenbuf = <int32_t*>self.blengths.data
self.length = length
self.nblocks = np.int32(len(self.blocs))
self.npoints = self.blengths.sum()
# self.block_start = blocs
# self.block_end = blocs + blengths
self.check_integrity()
def __reduce__(self):
args = (self.length, self.blocs, self.blengths)
return BlockIndex, args
def __repr__(self) -> str:
output = 'BlockIndex\n'
output += f'Block locations: {repr(self.blocs)}\n'
output += f'Block lengths: {repr(self.blengths)}'
return output
@property
def nbytes(self) -> int:
return self.blocs.nbytes + self.blengths.nbytes
@property
def ngaps(self) -> int:
return self.length - self.npoints
cdef check_integrity(self):
"""
Check:
- Locations are in ascending order
- No overlapping blocks
- Blocks do not start after the end of the index, nor extend beyond it
"""
cdef:
Py_ssize_t i
ndarray[int32_t, ndim=1] blocs, blengths
blocs = self.blocs
blengths = self.blengths
if len(blocs) != len(blengths):
raise ValueError('block bound arrays must be same length')
for i in range(self.nblocks):
if i > 0:
if blocs[i] <= blocs[i - 1]:
raise ValueError('Locations not in ascending order')
if i < self.nblocks - 1:
if blocs[i] + blengths[i] > blocs[i + 1]:
raise ValueError(f'Block {i} overlaps')
else:
if blocs[i] + blengths[i] > self.length:
raise ValueError(f'Block {i} extends beyond end')
# no zero-length blocks
if blengths[i] == 0:
raise ValueError(f'Zero-length block {i}')
def equals(self, other: object) -> bool:
if not isinstance(other, BlockIndex):
return False
if self is other:
return True
same_length = self.length == other.length
same_blocks = (np.array_equal(self.blocs, other.blocs) and
np.array_equal(self.blengths, other.blengths))
return same_length and same_blocks
def to_block_index(self):
return self
cpdef to_int_index(self):
cdef:
int32_t i = 0, j, b
int32_t offset
ndarray[int32_t, ndim=1] indices
indices = np.empty(self.npoints, dtype=np.int32)
for b in range(self.nblocks):
offset = self.locbuf[b]
for j in range(self.lenbuf[b]):
indices[i] = offset + j
i += 1
return IntIndex(self.length, indices)
@property
def indices(self):
return self.to_int_index().indices
cpdef BlockIndex intersect(self, SparseIndex other):
"""
Intersect two BlockIndex objects
Returns
-------
BlockIndex
"""
cdef:
BlockIndex y
ndarray[int32_t, ndim=1] xloc, xlen, yloc, ylen, out_bloc, out_blen
Py_ssize_t xi = 0, yi = 0, max_len, result_indexer = 0
int32_t cur_loc, cur_length, diff
y = other.to_block_index()
if self.length != y.length:
raise Exception('Indices must reference same underlying length')
xloc = self.blocs
xlen = self.blengths
yloc = y.blocs
ylen = y.blengths
# block may be split, but can't exceed original len / 2 + 1
max_len = min(self.length, y.length) // 2 + 1
out_bloc = np.empty(max_len, dtype=np.int32)
out_blen = np.empty(max_len, dtype=np.int32)
while True:
# we are done (or possibly never began)
if xi >= self.nblocks or yi >= y.nblocks:
break
# completely symmetric...would like to avoid code dup but oh well
if xloc[xi] >= yloc[yi]:
cur_loc = xloc[xi]
diff = xloc[xi] - yloc[yi]
if ylen[yi] <= diff:
# have to skip this block
yi += 1
continue
if ylen[yi] - diff < xlen[xi]:
# take end of y block, move onward
cur_length = ylen[yi] - diff
yi += 1
else:
# take end of x block
cur_length = xlen[xi]
xi += 1
else: # xloc[xi] < yloc[yi]
cur_loc = yloc[yi]
diff = yloc[yi] - xloc[xi]
if xlen[xi] <= diff:
# have to skip this block
xi += 1
continue
if xlen[xi] - diff < ylen[yi]:
# take end of x block, move onward
cur_length = xlen[xi] - diff
xi += 1
else:
# take end of y block
cur_length = ylen[yi]
yi += 1
out_bloc[result_indexer] = cur_loc
out_blen[result_indexer] = cur_length
result_indexer += 1
out_bloc = out_bloc[:result_indexer]
out_blen = out_blen[:result_indexer]
return BlockIndex(self.length, out_bloc, out_blen)
cpdef BlockIndex make_union(self, SparseIndex y):
"""
Combine two BlockIndex objects, accepting indices contained
in one or the other
Parameters
----------
y : SparseIndex
Notes
-----
union is a protected keyword in Cython, hence make_union
Returns
-------
BlockIndex
"""
return BlockUnion(self, y.to_block_index()).result
cpdef Py_ssize_t lookup(self, Py_ssize_t index):
"""
Return the internal location if value exists on given index.
Return -1 otherwise.
"""
cdef:
Py_ssize_t i, cum_len
ndarray[int32_t, ndim=1] locs, lens
locs = self.blocs
lens = self.blengths
if self.nblocks == 0:
return -1
elif index < locs[0]:
return -1
cum_len = 0
for i in range(self.nblocks):
if index >= locs[i] and index < locs[i] + lens[i]:
return cum_len + index - locs[i]
cum_len += lens[i]
return -1
@cython.wraparound(False)
cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer):
"""
Vectorized lookup, returns ndarray[int32_t]
"""
cdef:
Py_ssize_t n, i, j, ind_val
ndarray[int32_t, ndim=1] locs, lens
ndarray[int32_t, ndim=1] results
locs = self.blocs
lens = self.blengths
n = len(indexer)
results = np.empty(n, dtype=np.int32)
results[:] = -1
if self.npoints == 0:
return results
for i in range(n):
ind_val = indexer[i]
if not (ind_val < 0 or self.length <= ind_val):
cum_len = 0
for j in range(self.nblocks):
if ind_val >= locs[j] and ind_val < locs[j] + lens[j]:
results[i] = cum_len + ind_val - locs[j]
cum_len += lens[j]
return results
@cython.internal
cdef class BlockMerge:
"""
Object-oriented approach makes sharing state between recursive functions a
lot easier and reduces code duplication
"""
cdef:
BlockIndex x, y, result
ndarray xstart, xlen, xend, ystart, ylen, yend
int32_t xi, yi # block indices
def __init__(self, BlockIndex x, BlockIndex y):
self.x = x
self.y = y
if x.length != y.length:
raise Exception('Indices must reference same underlying length')
self.xstart = self.x.blocs
self.ystart = self.y.blocs
self.xend = self.x.blocs + self.x.blengths
self.yend = self.y.blocs + self.y.blengths
# self.xlen = self.x.blengths
# self.ylen = self.y.blengths
self.xi = 0
self.yi = 0
self.result = self._make_merged_blocks()
cdef _make_merged_blocks(self):
raise NotImplementedError
cdef _set_current_indices(self, int32_t xi, int32_t yi, bint mode):
if mode == 0:
self.xi = xi
self.yi = yi
else:
self.xi = yi
self.yi = xi
@cython.internal
cdef class BlockUnion(BlockMerge):
"""
Object-oriented approach makes sharing state between recursive functions a
lot easier and reduces code duplication
"""
cdef _make_merged_blocks(self):
cdef:
ndarray[int32_t, ndim=1] xstart, xend, ystart
ndarray[int32_t, ndim=1] yend, out_bloc, out_blen
int32_t nstart, nend, diff
Py_ssize_t max_len, result_indexer = 0
xstart = self.xstart
xend = self.xend
ystart = self.ystart
yend = self.yend
max_len = min(self.x.length, self.y.length) // 2 + 1
out_bloc = np.empty(max_len, dtype=np.int32)
out_blen = np.empty(max_len, dtype=np.int32)
while True:
# we are done (or possibly never began)
if self.xi >= self.x.nblocks and self.yi >= self.y.nblocks:
break
elif self.yi >= self.y.nblocks:
# through with y, just pass through x blocks
nstart = xstart[self.xi]
nend = xend[self.xi]
self.xi += 1
elif self.xi >= self.x.nblocks:
# through with x, just pass through y blocks
nstart = ystart[self.yi]
nend = yend[self.yi]
self.yi += 1
else:
# find end of new block
if xstart[self.xi] < ystart[self.yi]:
nstart = xstart[self.xi]
nend = self._find_next_block_end(0)
else:
nstart = ystart[self.yi]
nend = self._find_next_block_end(1)
out_bloc[result_indexer] = nstart
out_blen[result_indexer] = nend - nstart
result_indexer += 1
out_bloc = out_bloc[:result_indexer]
out_blen = out_blen[:result_indexer]
return BlockIndex(self.x.length, out_bloc, out_blen)
cdef int32_t _find_next_block_end(self, bint mode) except -1:
"""
Wow, this got complicated in a hurry
mode 0: block started in index x
mode 1: block started in index y
"""
cdef:
ndarray[int32_t, ndim=1] xstart, xend, ystart, yend
int32_t xi, yi, xnblocks, ynblocks, nend
if mode != 0 and mode != 1:
raise Exception('Mode must be 0 or 1')
# so symmetric code will work
if mode == 0:
xstart = self.xstart
xend = self.xend
xi = self.xi
ystart = self.ystart
yend = self.yend
yi = self.yi
ynblocks = self.y.nblocks
else:
xstart = self.ystart
xend = self.yend
xi = self.yi
ystart = self.xstart
yend = self.xend
yi = self.xi
ynblocks = self.x.nblocks
nend = xend[xi]
# done with y?
if yi == ynblocks:
self._set_current_indices(xi + 1, yi, mode)
return nend
elif nend < ystart[yi]:
# block ends before y block
self._set_current_indices(xi + 1, yi, mode)
return nend
else:
while yi < ynblocks and nend > yend[yi]:
yi += 1
self._set_current_indices(xi + 1, yi, mode)
if yi == ynblocks:
return nend
if nend < ystart[yi]:
# we're done, return the block end
return nend
else:
# merge blocks, continue searching
# this also catches the case where the blocks overlap
return self._find_next_block_end(1 - mode)
# -----------------------------------------------------------------------------
# Sparse arithmetic
include "sparse_op_helper.pxi"
# -----------------------------------------------------------------------------
# SparseArray mask create operations
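# Usage sketch (illustrative, not part of the original source):
#   arr = np.array(["a", "b", None, "a"], dtype=object)
#   make_mask_object_ndarray(arr, "a")  # -> array([False,  True,  True, False])
# Entries equal to the fill value (and of the same type) become False, i.e.
# they are the positions left out of the sparse values; everything else is True.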
def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value):
cdef:
object value
Py_ssize_t i
Py_ssize_t new_length = len(arr)
ndarray[int8_t, ndim=1] mask
mask = np.ones(new_length, dtype=np.int8)
for i in range(new_length):
value = arr[i]
if value == fill_value and type(value) == type(fill_value):
mask[i] = 0
return mask.view(dtype=bool)

View File

@@ -0,0 +1,309 @@
"""
Template for each `dtype` helper function for sparse ops
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
# ----------------------------------------------------------------------
# Sparse op
# ----------------------------------------------------------------------
ctypedef fused sparse_t:
float64_t
int64_t
cdef inline float64_t __div__(sparse_t a, sparse_t b):
if b == 0:
if a > 0:
return INF
elif a < 0:
return -INF
else:
return NaN
else:
return float(a) / b
cdef inline float64_t __truediv__(sparse_t a, sparse_t b):
return __div__(a, b)
cdef inline sparse_t __mod__(sparse_t a, sparse_t b):
if b == 0:
if sparse_t is float64_t:
return NaN
else:
return 0
else:
return a % b
cdef inline sparse_t __floordiv__(sparse_t a, sparse_t b):
if b == 0:
if sparse_t is float64_t:
# Match non-sparse Series behavior implemented in mask_zero_div_zero
if a > 0:
return INF
elif a < 0:
return -INF
return NaN
else:
return 0
else:
return a // b
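# Illustrative zero-division semantics of the helpers above (added comment):
#   __div__(1.0, 0.0) -> INF, __div__(-1.0, 0.0) -> -INF, __div__(0.0, 0.0) -> NaN
#   __mod__(a, 0)      -> NaN for float64 operands, 0 for int64 operands
#   __floordiv__(a, 0) mirrors __div__ for float64 and returns 0 for int64,
# matching the non-sparse Series behaviour referenced above.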
# ----------------------------------------------------------------------
# sparse array op
# ----------------------------------------------------------------------
{{py:
# dtype, arith_comp_group, logical_group
dtypes = [('float64', True, False),
('int64', True, True),
('uint8', False, True)]
# do not generate arithmetic / comparison template for uint8,
# it should be done in fused types
def get_op(tup):
assert isinstance(tup, tuple)
assert len(tup) == 4
opname, lval, rval, dtype = tup
ops_dict = {'add': '{0} + {1}',
'sub': '{0} - {1}',
'mul': '{0} * {1}',
'div': '__div__({0}, {1})',
'mod': '__mod__({0}, {1})',
'truediv': '__truediv__({0}, {1})',
'floordiv': '__floordiv__({0}, {1})',
'pow': '{0} ** {1}',
'eq': '{0} == {1}',
'ne': '{0} != {1}',
'lt': '{0} < {1}',
'gt': '{0} > {1}',
'le': '{0} <= {1}',
'ge': '{0} >= {1}',
'and': '{0} & {1}', # logical op
'or': '{0} | {1}',
'xor': '{0} ^ {1}'}
return ops_dict[opname].format(lval, rval)
def get_dispatch(dtypes):
ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv',
'floordiv', 'pow',
'eq', 'ne', 'lt', 'gt', 'le', 'ge',
'and', 'or', 'xor']
for opname in ops_list:
for dtype, arith_comp_group, logical_group in dtypes:
if opname in ('div', 'truediv'):
rdtype = 'float64'
elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
# comparison op
rdtype = 'uint8'
elif opname in ('and', 'or', 'xor'):
# logical op
rdtype = 'uint8'
else:
rdtype = dtype
if opname in ('and', 'or', 'xor'):
if logical_group:
yield opname, dtype, rdtype
else:
if arith_comp_group:
yield opname, dtype, rdtype
}}
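# Example of the generated names (added note): for opname='add' and
# dtype='float64' the loop below emits block_op_add_float64,
# int_op_add_float64 and sparse_add_float64.  Result dtypes follow
# get_dispatch: comparisons and logical ops produce uint8, div/truediv
# produce float64, everything else keeps the operand dtype; logical ops are
# only generated for int64 and uint8.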
{{for opname, dtype, rdtype in get_dispatch(dtypes)}}
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline tuple block_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_,
BlockIndex xindex,
{{dtype}}_t xfill,
{{dtype}}_t[:] y_,
BlockIndex yindex,
{{dtype}}_t yfill):
'''
Binary operator on BlockIndex objects with fill values
'''
cdef:
BlockIndex out_index
Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
int32_t xbp = 0, ybp = 0 # block positions
int32_t xloc, yloc
Py_ssize_t xblock = 0, yblock = 0 # block numbers
{{dtype}}_t[:] x, y
ndarray[{{rdtype}}_t, ndim=1] out
# to suppress Cython warning
x = x_
y = y_
out_index = xindex.make_union(yindex)
out = np.empty(out_index.npoints, dtype=np.{{rdtype}})
# Wow, what a hack job. Need to do something about this
# walk the two SparseVectors, adding matched locations...
for out_i in range(out_index.npoints):
if yblock == yindex.nblocks:
# use y fill value
out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
xi += 1
# advance x location
xbp += 1
if xbp == xindex.lenbuf[xblock]:
xblock += 1
xbp = 0
continue
if xblock == xindex.nblocks:
# use x fill value
out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
yi += 1
# advance y location
ybp += 1
if ybp == yindex.lenbuf[yblock]:
yblock += 1
ybp = 0
continue
yloc = yindex.locbuf[yblock] + ybp
xloc = xindex.locbuf[xblock] + xbp
# each index in the out_index had to come from either x, y, or both
if xloc == yloc:
out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}}
xi += 1
yi += 1
# advance both locations
xbp += 1
if xbp == xindex.lenbuf[xblock]:
xblock += 1
xbp = 0
ybp += 1
if ybp == yindex.lenbuf[yblock]:
yblock += 1
ybp = 0
elif xloc < yloc:
# use y fill value
out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
xi += 1
# advance x location
xbp += 1
if xbp == xindex.lenbuf[xblock]:
xblock += 1
xbp = 0
else:
# use x fill value
out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
yi += 1
# advance y location
ybp += 1
if ybp == yindex.lenbuf[yblock]:
yblock += 1
ybp = 0
return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}}
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline tuple int_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_,
IntIndex xindex,
{{dtype}}_t xfill,
{{dtype}}_t[:] y_,
IntIndex yindex,
{{dtype}}_t yfill):
cdef:
IntIndex out_index
Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
int32_t xloc, yloc
int32_t[:] xindices, yindices, out_indices
{{dtype}}_t[:] x, y
ndarray[{{rdtype}}_t, ndim=1] out
# suppress Cython compiler warnings due to inlining
x = x_
y = y_
# need to do this first to know size of result array
out_index = xindex.make_union(yindex)
out = np.empty(out_index.npoints, dtype=np.{{rdtype}})
xindices = xindex.indices
yindices = yindex.indices
out_indices = out_index.indices
# walk the two SparseVectors, adding matched locations...
for out_i in range(out_index.npoints):
if xi == xindex.npoints:
# use x fill value
out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
yi += 1
continue
if yi == yindex.npoints:
# use y fill value
out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
xi += 1
continue
xloc = xindices[xi]
yloc = yindices[yi]
# each index in the out_index had to come from either x, y, or both
if xloc == yloc:
out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}}
xi += 1
yi += 1
elif xloc < yloc:
# use y fill value
out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
xi += 1
else:
# use x fill value
out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
yi += 1
return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}}
cpdef sparse_{{opname}}_{{dtype}}({{dtype}}_t[:] x,
SparseIndex xindex, {{dtype}}_t xfill,
{{dtype}}_t[:] y,
SparseIndex yindex, {{dtype}}_t yfill):
if isinstance(xindex, BlockIndex):
return block_op_{{opname}}_{{dtype}}(x, xindex.to_block_index(), xfill,
y, yindex.to_block_index(), yfill)
elif isinstance(xindex, IntIndex):
return int_op_{{opname}}_{{dtype}}(x, xindex.to_int_index(), xfill,
y, yindex.to_int_index(), yfill)
else:
raise NotImplementedError
{{endfor}}

View File

@@ -0,0 +1,48 @@
#ifndef _PANDAS_MATH_H_
#define _PANDAS_MATH_H_
// MSVC 2017 has a bug where `x == x` can be true for NaNs.
// MSC_VER from https://stackoverflow.com/a/70630/1889400
// Place upper bound on this check once a fixed MSVC is released.
#if defined(_MSC_VER) && (_MSC_VER < 1800)
#include <cmath>
// In older versions of Visual Studio there wasn't a std::signbit defined
// This defines it using _copysign
namespace std {
__inline int isnan(double x) { return _isnan(x); }
__inline int signbit(double num) { return _copysign(1.0, num) < 0; }
__inline int notnan(double x) { return !isnan(x); }
}
#elif defined(_MSC_VER) && (_MSC_VER >= 1900)
#include <cmath>
namespace std {
__inline int isnan(double x) { return _isnan(x); }
__inline int notnan(double x) { return !isnan(x); }
}
#elif defined(_MSC_VER)
#include <cmath>
namespace std {
__inline int isnan(double x) { return _isnan(x); }
__inline int notnan(double x) { return x == x; }
}
#elif defined(__MVS__)
#include <cmath>
#define _signbit signbit
#undef signbit
#undef isnan
namespace std {
__inline int notnan(double x) { return x == x; }
__inline int signbit(double num) { return _signbit(num); }
__inline int isnan(double x) { return isnan(x); }
}
#else
#include <cmath>
namespace std {
__inline int notnan(double x) { return x == x; }
}
#endif
#endif

View File

@@ -0,0 +1,305 @@
// ISO C9x compliant inttypes.h for Microsoft Visual Studio
// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
//
// Copyright (c) 2006 Alexander Chemeris
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. The name of the author may be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef _MSC_VER // [
#error "Use this header only with Microsoft Visual C++ compilers!"
#endif // _MSC_VER ]
#ifndef _MSC_INTTYPES_H_ // [
#define _MSC_INTTYPES_H_
#if _MSC_VER > 1000
#pragma once
#endif
#include "ms_stdint.h"
// 7.8 Format conversion of integer types
typedef struct {
intmax_t quot;
intmax_t rem;
} imaxdiv_t;
// 7.8.1 Macros for format specifiers
#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198
// The fprintf macros for signed integers are:
#define PRId8 "d"
#define PRIi8 "i"
#define PRIdLEAST8 "d"
#define PRIiLEAST8 "i"
#define PRIdFAST8 "d"
#define PRIiFAST8 "i"
#define PRId16 "hd"
#define PRIi16 "hi"
#define PRIdLEAST16 "hd"
#define PRIiLEAST16 "hi"
#define PRIdFAST16 "hd"
#define PRIiFAST16 "hi"
#define PRId32 "I32d"
#define PRIi32 "I32i"
#define PRIdLEAST32 "I32d"
#define PRIiLEAST32 "I32i"
#define PRIdFAST32 "I32d"
#define PRIiFAST32 "I32i"
#define PRId64 "I64d"
#define PRIi64 "I64i"
#define PRIdLEAST64 "I64d"
#define PRIiLEAST64 "I64i"
#define PRIdFAST64 "I64d"
#define PRIiFAST64 "I64i"
#define PRIdMAX "I64d"
#define PRIiMAX "I64i"
#define PRIdPTR "Id"
#define PRIiPTR "Ii"
// The fprintf macros for unsigned integers are:
#define PRIo8 "o"
#define PRIu8 "u"
#define PRIx8 "x"
#define PRIX8 "X"
#define PRIoLEAST8 "o"
#define PRIuLEAST8 "u"
#define PRIxLEAST8 "x"
#define PRIXLEAST8 "X"
#define PRIoFAST8 "o"
#define PRIuFAST8 "u"
#define PRIxFAST8 "x"
#define PRIXFAST8 "X"
#define PRIo16 "ho"
#define PRIu16 "hu"
#define PRIx16 "hx"
#define PRIX16 "hX"
#define PRIoLEAST16 "ho"
#define PRIuLEAST16 "hu"
#define PRIxLEAST16 "hx"
#define PRIXLEAST16 "hX"
#define PRIoFAST16 "ho"
#define PRIuFAST16 "hu"
#define PRIxFAST16 "hx"
#define PRIXFAST16 "hX"
#define PRIo32 "I32o"
#define PRIu32 "I32u"
#define PRIx32 "I32x"
#define PRIX32 "I32X"
#define PRIoLEAST32 "I32o"
#define PRIuLEAST32 "I32u"
#define PRIxLEAST32 "I32x"
#define PRIXLEAST32 "I32X"
#define PRIoFAST32 "I32o"
#define PRIuFAST32 "I32u"
#define PRIxFAST32 "I32x"
#define PRIXFAST32 "I32X"
#define PRIo64 "I64o"
#define PRIu64 "I64u"
#define PRIx64 "I64x"
#define PRIX64 "I64X"
#define PRIoLEAST64 "I64o"
#define PRIuLEAST64 "I64u"
#define PRIxLEAST64 "I64x"
#define PRIXLEAST64 "I64X"
#define PRIoFAST64 "I64o"
#define PRIuFAST64 "I64u"
#define PRIxFAST64 "I64x"
#define PRIXFAST64 "I64X"
#define PRIoMAX "I64o"
#define PRIuMAX "I64u"
#define PRIxMAX "I64x"
#define PRIXMAX "I64X"
#define PRIoPTR "Io"
#define PRIuPTR "Iu"
#define PRIxPTR "Ix"
#define PRIXPTR "IX"
// The fscanf macros for signed integers are:
#define SCNd8 "d"
#define SCNi8 "i"
#define SCNdLEAST8 "d"
#define SCNiLEAST8 "i"
#define SCNdFAST8 "d"
#define SCNiFAST8 "i"
#define SCNd16 "hd"
#define SCNi16 "hi"
#define SCNdLEAST16 "hd"
#define SCNiLEAST16 "hi"
#define SCNdFAST16 "hd"
#define SCNiFAST16 "hi"
#define SCNd32 "ld"
#define SCNi32 "li"
#define SCNdLEAST32 "ld"
#define SCNiLEAST32 "li"
#define SCNdFAST32 "ld"
#define SCNiFAST32 "li"
#define SCNd64 "I64d"
#define SCNi64 "I64i"
#define SCNdLEAST64 "I64d"
#define SCNiLEAST64 "I64i"
#define SCNdFAST64 "I64d"
#define SCNiFAST64 "I64i"
#define SCNdMAX "I64d"
#define SCNiMAX "I64i"
#ifdef _WIN64 // [
# define SCNdPTR "I64d"
# define SCNiPTR "I64i"
#else // _WIN64 ][
# define SCNdPTR "ld"
# define SCNiPTR "li"
#endif // _WIN64 ]
// The fscanf macros for unsigned integers are:
#define SCNo8 "o"
#define SCNu8 "u"
#define SCNx8 "x"
#define SCNX8 "X"
#define SCNoLEAST8 "o"
#define SCNuLEAST8 "u"
#define SCNxLEAST8 "x"
#define SCNXLEAST8 "X"
#define SCNoFAST8 "o"
#define SCNuFAST8 "u"
#define SCNxFAST8 "x"
#define SCNXFAST8 "X"
#define SCNo16 "ho"
#define SCNu16 "hu"
#define SCNx16 "hx"
#define SCNX16 "hX"
#define SCNoLEAST16 "ho"
#define SCNuLEAST16 "hu"
#define SCNxLEAST16 "hx"
#define SCNXLEAST16 "hX"
#define SCNoFAST16 "ho"
#define SCNuFAST16 "hu"
#define SCNxFAST16 "hx"
#define SCNXFAST16 "hX"
#define SCNo32 "lo"
#define SCNu32 "lu"
#define SCNx32 "lx"
#define SCNX32 "lX"
#define SCNoLEAST32 "lo"
#define SCNuLEAST32 "lu"
#define SCNxLEAST32 "lx"
#define SCNXLEAST32 "lX"
#define SCNoFAST32 "lo"
#define SCNuFAST32 "lu"
#define SCNxFAST32 "lx"
#define SCNXFAST32 "lX"
#define SCNo64 "I64o"
#define SCNu64 "I64u"
#define SCNx64 "I64x"
#define SCNX64 "I64X"
#define SCNoLEAST64 "I64o"
#define SCNuLEAST64 "I64u"
#define SCNxLEAST64 "I64x"
#define SCNXLEAST64 "I64X"
#define SCNoFAST64 "I64o"
#define SCNuFAST64 "I64u"
#define SCNxFAST64 "I64x"
#define SCNXFAST64 "I64X"
#define SCNoMAX "I64o"
#define SCNuMAX "I64u"
#define SCNxMAX "I64x"
#define SCNXMAX "I64X"
#ifdef _WIN64 // [
# define SCNoPTR "I64o"
# define SCNuPTR "I64u"
# define SCNxPTR "I64x"
# define SCNXPTR "I64X"
#else // _WIN64 ][
# define SCNoPTR "lo"
# define SCNuPTR "lu"
# define SCNxPTR "lx"
# define SCNXPTR "lX"
#endif // _WIN64 ]
#endif // __STDC_FORMAT_MACROS ]
// 7.8.2 Functions for greatest-width integer types
// 7.8.2.1 The imaxabs function
#define imaxabs _abs64
// 7.8.2.2 The imaxdiv function
// This is modified version of div() function from Microsoft's div.c found
// in %MSVC.NET%\crt\src\div.c
#ifdef STATIC_IMAXDIV // [
static
#else // STATIC_IMAXDIV ][
_inline
#endif // STATIC_IMAXDIV ]
imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
{
imaxdiv_t result;
result.quot = numer / denom;
result.rem = numer % denom;
if (numer < 0 && result.rem > 0) {
// did division wrong; must fix up
++result.quot;
result.rem -= denom;
}
return result;
}
// 7.8.2.3 The strtoimax and strtoumax functions
#define strtoimax _strtoi64
#define strtoumax _strtoui64
// 7.8.2.4 The wcstoimax and wcstoumax functions
#define wcstoimax _wcstoi64
#define wcstoumax _wcstoui64
#endif // _MSC_INTTYPES_H_ ]

View File

@@ -0,0 +1,247 @@
// ISO C9x compliant stdint.h for Microsoft Visual Studio
// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
//
// Copyright (c) 2006-2008 Alexander Chemeris
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. The name of the author may be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef _MSC_VER // [
#error "Use this header only with Microsoft Visual C++ compilers!"
#endif // _MSC_VER ]
#ifndef _MSC_STDINT_H_ // [
#define _MSC_STDINT_H_
#if _MSC_VER > 1000
#pragma once
#endif
#include <limits.h>
// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
// compiling for ARM we should wrap <wchar.h> include with 'extern "C++" {}'
// or the compiler gives many errors like this:
// error C2733: second C linkage of overloaded function 'wmemchr' not allowed
#ifdef __cplusplus
extern "C" {
#endif
# include <wchar.h>
#ifdef __cplusplus
}
#endif
// Define _W64 macros to mark types changing their size, like intptr_t.
#ifndef _W64
# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
# define _W64 __w64
# else
# define _W64
# endif
#endif
// 7.18.1 Integer types
// 7.18.1.1 Exact-width integer types
// Visual Studio 6 and Embedded Visual C++ 4 don't
// realize that, e.g. char has the same size as __int8
// so we give up on __intX for them.
#if (_MSC_VER < 1300)
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
#else
typedef signed __int8 int8_t;
typedef signed __int16 int16_t;
typedef signed __int32 int32_t;
typedef unsigned __int8 uint8_t;
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;
#endif
typedef signed __int64 int64_t;
typedef unsigned __int64 uint64_t;
// 7.18.1.2 Minimum-width integer types
typedef int8_t int_least8_t;
typedef int16_t int_least16_t;
typedef int32_t int_least32_t;
typedef int64_t int_least64_t;
typedef uint8_t uint_least8_t;
typedef uint16_t uint_least16_t;
typedef uint32_t uint_least32_t;
typedef uint64_t uint_least64_t;
// 7.18.1.3 Fastest minimum-width integer types
typedef int8_t int_fast8_t;
typedef int16_t int_fast16_t;
typedef int32_t int_fast32_t;
typedef int64_t int_fast64_t;
typedef uint8_t uint_fast8_t;
typedef uint16_t uint_fast16_t;
typedef uint32_t uint_fast32_t;
typedef uint64_t uint_fast64_t;
// 7.18.1.4 Integer types capable of holding object pointers
#ifdef _WIN64 // [
typedef signed __int64 intptr_t;
typedef unsigned __int64 uintptr_t;
#else // _WIN64 ][
typedef _W64 signed int intptr_t;
typedef _W64 unsigned int uintptr_t;
#endif // _WIN64 ]
// 7.18.1.5 Greatest-width integer types
typedef int64_t intmax_t;
typedef uint64_t uintmax_t;
// 7.18.2 Limits of specified-width integer types
#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
// 7.18.2.1 Limits of exact-width integer types
#define INT8_MIN ((int8_t)_I8_MIN)
#define INT8_MAX _I8_MAX
#define INT16_MIN ((int16_t)_I16_MIN)
#define INT16_MAX _I16_MAX
#define INT32_MIN ((int32_t)_I32_MIN)
#define INT32_MAX _I32_MAX
#define INT64_MIN ((int64_t)_I64_MIN)
#define INT64_MAX _I64_MAX
#define UINT8_MAX _UI8_MAX
#define UINT16_MAX _UI16_MAX
#define UINT32_MAX _UI32_MAX
#define UINT64_MAX _UI64_MAX
// 7.18.2.2 Limits of minimum-width integer types
#define INT_LEAST8_MIN INT8_MIN
#define INT_LEAST8_MAX INT8_MAX
#define INT_LEAST16_MIN INT16_MIN
#define INT_LEAST16_MAX INT16_MAX
#define INT_LEAST32_MIN INT32_MIN
#define INT_LEAST32_MAX INT32_MAX
#define INT_LEAST64_MIN INT64_MIN
#define INT_LEAST64_MAX INT64_MAX
#define UINT_LEAST8_MAX UINT8_MAX
#define UINT_LEAST16_MAX UINT16_MAX
#define UINT_LEAST32_MAX UINT32_MAX
#define UINT_LEAST64_MAX UINT64_MAX
// 7.18.2.3 Limits of fastest minimum-width integer types
#define INT_FAST8_MIN INT8_MIN
#define INT_FAST8_MAX INT8_MAX
#define INT_FAST16_MIN INT16_MIN
#define INT_FAST16_MAX INT16_MAX
#define INT_FAST32_MIN INT32_MIN
#define INT_FAST32_MAX INT32_MAX
#define INT_FAST64_MIN INT64_MIN
#define INT_FAST64_MAX INT64_MAX
#define UINT_FAST8_MAX UINT8_MAX
#define UINT_FAST16_MAX UINT16_MAX
#define UINT_FAST32_MAX UINT32_MAX
#define UINT_FAST64_MAX UINT64_MAX
// 7.18.2.4 Limits of integer types capable of holding object pointers
#ifdef _WIN64 // [
# define INTPTR_MIN INT64_MIN
# define INTPTR_MAX INT64_MAX
# define UINTPTR_MAX UINT64_MAX
#else // _WIN64 ][
# define INTPTR_MIN INT32_MIN
# define INTPTR_MAX INT32_MAX
# define UINTPTR_MAX UINT32_MAX
#endif // _WIN64 ]
// 7.18.2.5 Limits of greatest-width integer types
#define INTMAX_MIN INT64_MIN
#define INTMAX_MAX INT64_MAX
#define UINTMAX_MAX UINT64_MAX
// 7.18.3 Limits of other integer types
#ifdef _WIN64 // [
# define PTRDIFF_MIN _I64_MIN
# define PTRDIFF_MAX _I64_MAX
#else // _WIN64 ][
# define PTRDIFF_MIN _I32_MIN
# define PTRDIFF_MAX _I32_MAX
#endif // _WIN64 ]
#define SIG_ATOMIC_MIN INT_MIN
#define SIG_ATOMIC_MAX INT_MAX
#ifndef SIZE_MAX // [
# ifdef _WIN64 // [
# define SIZE_MAX _UI64_MAX
# else // _WIN64 ][
# define SIZE_MAX _UI32_MAX
# endif // _WIN64 ]
#endif // SIZE_MAX ]
// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
#ifndef WCHAR_MIN // [
# define WCHAR_MIN 0
#endif // WCHAR_MIN ]
#ifndef WCHAR_MAX // [
# define WCHAR_MAX _UI16_MAX
#endif // WCHAR_MAX ]
#define WINT_MIN 0
#define WINT_MAX _UI16_MAX
#endif // __STDC_LIMIT_MACROS ]
// 7.18.4 Limits of other integer types
#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
// 7.18.4.1 Macros for minimum-width integer constants
#define INT8_C(val) val##i8
#define INT16_C(val) val##i16
#define INT32_C(val) val##i32
#define INT64_C(val) val##i64
#define UINT8_C(val) val##ui8
#define UINT16_C(val) val##ui16
#define UINT32_C(val) val##ui32
#define UINT64_C(val) val##ui64
// 7.18.4.2 Macros for greatest-width integer constants
#define INTMAX_C INT64_C
#define UINTMAX_C UINT64_C
#endif // __STDC_CONSTANT_MACROS ]
#endif // _MSC_STDINT_H_ ]

View File

@@ -0,0 +1,16 @@
#ifndef _PANDAS_PORTABLE_H_
#define _PANDAS_PORTABLE_H_
#if defined(_MSC_VER)
#define strcasecmp( s1, s2 ) _stricmp( s1, s2 )
#endif
// GH-23516 - works around locale perf issues
// from MUSL libc, MIT Licensed - see LICENSES
#define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u)
#define getdigit_ascii(c, default) (isdigit_ascii(c) ? ((int)((c) - '0')) : default)
#define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5))
#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c))
#define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? ((c) | 0x20) : (c))
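// Illustrative behaviour of the ASCII helpers above (added comment):
//   isdigit_ascii('7') -> 1,  isdigit_ascii('x') -> 0
//   getdigit_ascii('7', -1) -> 7,  getdigit_ascii('x', -1) -> -1
//   toupper_ascii('q') -> 'Q',  tolower_ascii('Q') -> 'q'
// Unlike the <ctype.h> versions, these never consult the current locale.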
#endif

View File

@@ -0,0 +1,10 @@
#ifndef _PANDAS_STDINT_H_
#define _PANDAS_STDINT_H_
#if defined(_MSC_VER) && (_MSC_VER < 1900)
#include "ms_stdint.h"
#else
#include <stdint.h>
#endif
#endif

View File

@@ -0,0 +1,27 @@
/*
Copyright (c) 2016, PyData Development Team
All rights reserved.
Distributed under the terms of the BSD Simplified License.
The full license is in the LICENSE file, distributed with this software.
*/
#ifndef PANDAS__LIBS_SRC_INLINE_HELPER_H_
#define PANDAS__LIBS_SRC_INLINE_HELPER_H_
#ifndef PANDAS_INLINE
#if defined(__clang__)
#define PANDAS_INLINE static __inline__ __attribute__ ((__unused__))
#elif defined(__GNUC__)
#define PANDAS_INLINE static __inline__
#elif defined(_MSC_VER)
#define PANDAS_INLINE static __inline
#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#define PANDAS_INLINE static inline
#else
#define PANDAS_INLINE
#endif // __GNUC__
#endif // PANDAS_INLINE
#endif // PANDAS__LIBS_SRC_INLINE_HELPER_H_

View File

@@ -0,0 +1,719 @@
/* The MIT License
Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
An example:
#include "khash.h"
KHASH_MAP_INIT_INT(32, char)
int main() {
int ret, is_missing;
khiter_t k;
khash_t(32) *h = kh_init(32);
k = kh_put(32, h, 5, &ret);
if (!ret) kh_del(32, h, k);
kh_value(h, k) = 10;
k = kh_get(32, h, 10);
is_missing = (k == kh_end(h));
k = kh_get(32, h, 5);
kh_del(32, h, k);
for (k = kh_begin(h); k != kh_end(h); ++k)
if (kh_exist(h, k)) kh_value(h, k) = 1;
kh_destroy(32, h);
return 0;
}
*/
/*
2011-09-16 (0.2.6):
* The capacity is a power of 2. This seems to dramatically improve the
speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
- https://github.com/stefanocasazza/ULib
- https://nothings.org/computer/judy/
* Allow to optionally use linear probing which usually has better
performance for random input. Double hashing is still the default as it
is more robust to certain non-random input.
* Added Wang's integer hash function (not used by default). This hash
function is more robust to certain non-random input.
2011-02-14 (0.2.5):
* Allow to declare global functions.
2009-09-26 (0.2.4):
* Improve portability
2008-09-19 (0.2.3):
* Corrected the example
* Improved interfaces
2008-09-11 (0.2.2):
* Improved speed a little in kh_put()
2008-09-10 (0.2.1):
* Added kh_clear()
* Fixed a compiling error
2008-09-02 (0.2.0):
* Changed to token concatenation which increases flexibility.
2008-08-31 (0.1.2):
* Fixed a bug in kh_get(), which has not been tested previously.
2008-08-31 (0.1.1):
* Added destructor
*/
#ifndef __AC_KHASH_H
#define __AC_KHASH_H
/*!
@header
Generic hash table library.
*/
#define AC_VERSION_KHASH_H "0.2.6"
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include "../inline_helper.h"
// hooks for memory allocator, C-runtime allocator used per default
#ifndef KHASH_MALLOC
#define KHASH_MALLOC malloc
#endif
#ifndef KHASH_REALLOC
#define KHASH_REALLOC realloc
#endif
#ifndef KHASH_CALLOC
#define KHASH_CALLOC calloc
#endif
#ifndef KHASH_FREE
#define KHASH_FREE free
#endif
#if UINT_MAX == 0xffffffffu
typedef unsigned int khuint32_t;
typedef signed int khint32_t;
#elif ULONG_MAX == 0xffffffffu
typedef unsigned long khuint32_t;
typedef signed long khint32_t;
#endif
#if ULONG_MAX == ULLONG_MAX
typedef unsigned long khuint64_t;
typedef signed long khint64_t;
#else
typedef unsigned long long khuint64_t;
typedef signed long long khint64_t;
#endif
#if UINT_MAX == 0xffffu
typedef unsigned int khuint16_t;
typedef signed int khint16_t;
#elif USHRT_MAX == 0xffffu
typedef unsigned short khuint16_t;
typedef signed short khint16_t;
#endif
#if UCHAR_MAX == 0xffu
typedef unsigned char khuint8_t;
typedef signed char khint8_t;
#endif
typedef double khfloat64_t;
typedef float khfloat32_t;
typedef khuint32_t khuint_t;
typedef khuint_t khiter_t;
#define __ac_isempty(flag, i) ((flag[i>>5]>>(i&0x1fU))&1)
#define __ac_isdel(flag, i) (0)
#define __ac_iseither(flag, i) __ac_isempty(flag, i)
#define __ac_set_isdel_false(flag, i) (0)
#define __ac_set_isempty_false(flag, i) (flag[i>>5]&=~(1ul<<(i&0x1fU)))
#define __ac_set_isempty_true(flag, i) (flag[i>>5]|=(1ul<<(i&0x1fU)))
#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i)
#define __ac_set_isdel_true(flag, i) ((void)0)
// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
khuint32_t PANDAS_INLINE murmur2_32to32(khuint32_t k){
const khuint32_t SEED = 0xc70f6907UL;
// 'm' and 'r' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
const khuint32_t M_32 = 0x5bd1e995;
const int R_32 = 24;
// Initialize the hash to a 'random' value
khuint32_t h = SEED ^ 4;
//handle 4 bytes:
k *= M_32;
k ^= k >> R_32;
k *= M_32;
h *= M_32;
h ^= k;
// Do a few final mixes of the hash to ensure the "last few
// bytes" are well-incorporated. (Really needed here?)
h ^= h >> 13;
h *= M_32;
h ^= h >> 15;
return h;
}
// it is possible to have a special x64-version, which would need fewer operations, but
// using the 32-bit version also has some benefits:
// - one code for 32bit and 64bit builds
// - the same case for 32bit and 64bit builds
// - no performance difference could be measured compared to a possible x64-version
khuint32_t PANDAS_INLINE murmur2_32_32to32(khuint32_t k1, khuint32_t k2){
const khuint32_t SEED = 0xc70f6907UL;
// 'm' and 'r' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
const khuint32_t M_32 = 0x5bd1e995;
const int R_32 = 24;
// Initialize the hash to a 'random' value
khuint32_t h = SEED ^ 4;
//handle first 4 bytes:
k1 *= M_32;
k1 ^= k1 >> R_32;
k1 *= M_32;
h *= M_32;
h ^= k1;
//handle second 4 bytes:
k2 *= M_32;
k2 ^= k2 >> R_32;
k2 *= M_32;
h *= M_32;
h ^= k2;
// Do a few final mixes of the hash to ensure the "last few
// bytes" are well-incorporated.
h ^= h >> 13;
h *= M_32;
h ^= h >> 15;
return h;
}
khuint32_t PANDAS_INLINE murmur2_64to32(khuint64_t k){
khuint32_t k1 = (khuint32_t)k;
khuint32_t k2 = (khuint32_t)(k >> 32);
return murmur2_32_32to32(k1, k2);
}
#ifdef KHASH_LINEAR
#define __ac_inc(k, m) 1
#else
#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m)
#endif
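// Added note: __ac_inc is the probe step.  With KHASH_LINEAR it is always 1
// (linear probing); otherwise it is taken from a second hash and forced odd,
// so with a power-of-two bucket count the sequence i, i+inc, i+2*inc, ...
// (mod n_buckets) visits every bucket before repeating.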
#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5)
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
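// Illustrative example (added comment): kroundup32 rounds its argument up to
// the next power of two in place, e.g. x = 5 becomes 8, x = 64 stays 64,
// and x = 100 becomes 128.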
static const double __ac_HASH_UPPER = 0.77;
#define KHASH_DECLARE(name, khkey_t, khval_t) \
typedef struct { \
khuint_t n_buckets, size, n_occupied, upper_bound; \
khuint32_t *flags; \
khkey_t *keys; \
khval_t *vals; \
} kh_##name##_t; \
extern kh_##name##_t *kh_init_##name(); \
extern void kh_destroy_##name(kh_##name##_t *h); \
extern void kh_clear_##name(kh_##name##_t *h); \
extern khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
extern void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets); \
extern khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
extern void kh_del_##name(kh_##name##_t *h, khuint_t x);
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
typedef struct { \
khuint_t n_buckets, size, n_occupied, upper_bound; \
khuint32_t *flags; \
khkey_t *keys; \
khval_t *vals; \
} kh_##name##_t; \
SCOPE kh_##name##_t *kh_init_##name(void) { \
return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \
} \
SCOPE void kh_destroy_##name(kh_##name##_t *h) \
{ \
if (h) { \
KHASH_FREE(h->keys); KHASH_FREE(h->flags); \
KHASH_FREE(h->vals); \
KHASH_FREE(h); \
} \
} \
SCOPE void kh_clear_##name(kh_##name##_t *h) \
{ \
if (h && h->flags) { \
memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khuint32_t)); \
h->size = h->n_occupied = 0; \
} \
} \
SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
{ \
if (h->n_buckets) { \
khuint_t inc, k, i, last, mask; \
mask = h->n_buckets - 1; \
k = __hash_func(key); i = k & mask; \
inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
i = (i + inc) & mask; \
if (i == last) return h->n_buckets; \
} \
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
} else return 0; \
} \
SCOPE void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets) \
{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
khuint32_t *new_flags = 0; \
khuint_t j = 1; \
{ \
kroundup32(new_n_buckets); \
if (new_n_buckets < 4) new_n_buckets = 4; \
if (h->size >= (khuint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
else { /* hash table size to be changed (shrink or expand); rehash */ \
new_flags = (khuint32_t*)KHASH_MALLOC(__ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \
memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \
if (h->n_buckets < new_n_buckets) { /* expand */ \
h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
} /* otherwise shrink */ \
} \
} \
if (j) { /* rehashing is needed */ \
for (j = 0; j != h->n_buckets; ++j) { \
if (__ac_iseither(h->flags, j) == 0) { \
khkey_t key = h->keys[j]; \
khval_t val; \
khuint_t new_mask; \
new_mask = new_n_buckets - 1; \
if (kh_is_map) val = h->vals[j]; \
__ac_set_isempty_true(h->flags, j); \
while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
khuint_t inc, k, i; \
k = __hash_func(key); \
i = k & new_mask; \
inc = __ac_inc(k, new_mask); \
while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
__ac_set_isempty_false(new_flags, i); \
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
__ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \
} else { /* write the element and jump out of the loop */ \
h->keys[i] = key; \
if (kh_is_map) h->vals[i] = val; \
break; \
} \
} \
} \
} \
if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
} \
KHASH_FREE(h->flags); /* free the working space */ \
h->flags = new_flags; \
h->n_buckets = new_n_buckets; \
h->n_occupied = h->size; \
h->upper_bound = (khuint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
} \
} \
SCOPE khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
{ \
khuint_t x; \
if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \
else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \
} /* TODO: implement automatic shrinking; resize() already supports shrinking */ \
{ \
khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
else { \
inc = __ac_inc(k, mask); last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (__ac_isdel(h->flags, i)) site = i; \
i = (i + inc) & mask; \
if (i == last) { x = site; break; } \
} \
if (x == h->n_buckets) { \
if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
else x = i; \
} \
} \
} \
if (__ac_isempty(h->flags, x)) { /* not present at all */ \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; ++h->n_occupied; \
*ret = 1; \
} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; \
*ret = 2; \
} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
return x; \
} \
SCOPE void kh_del_##name(kh_##name##_t *h, khuint_t x) \
{ \
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
__ac_set_isdel_true(h->flags, x); \
--h->size; \
} \
}
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
KHASH_INIT2(name, PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
/* --- BEGIN OF HASH FUNCTIONS --- */
/*! @function
@abstract Integer hash function
@param key The integer [khuint32_t]
@return The hash value [khuint_t]
*/
#define kh_int_hash_func(key) (khuint32_t)(key)
/*! @function
@abstract Integer comparison function
*/
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
@abstract 64-bit integer hash function
@param key The integer [khuint64_t]
@return The hash value [khuint_t]
*/
PANDAS_INLINE khuint_t kh_int64_hash_func(khuint64_t key)
{
return (khuint_t)((key)>>33^(key)^(key)<<11);
}
/*! @function
@abstract 64-bit integer comparison function
*/
#define kh_int64_hash_equal(a, b) ((a) == (b))
/*! @function
@abstract const char* hash function
@param s Pointer to a null terminated string
@return The hash value
*/
PANDAS_INLINE khuint_t __ac_X31_hash_string(const char *s)
{
khuint_t h = *s;
if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
return h;
}
/*! @function
@abstract Another interface to const char* hash function
@param key Pointer to a null terminated string [const char*]
@return The hash value [khuint_t]
*/
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
/*! @function
@abstract Const char* comparison function
*/
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key)
{
key += ~(key << 15);
key ^= (key >> 10);
key += (key << 3);
key ^= (key >> 6);
key += ~(key << 11);
key ^= (key >> 16);
return key;
}
#define kh_int_hash_func2(k) __ac_Wang_hash((khuint_t)k)
/* --- END OF HASH FUNCTIONS --- */
/* Other convenient macros... */
/*!
@abstract Type of the hash table.
@param name Name of the hash table [symbol]
*/
#define khash_t(name) kh_##name##_t
/*! @function
@abstract Initiate a hash table.
@param name Name of the hash table [symbol]
@return Pointer to the hash table [khash_t(name)*]
*/
#define kh_init(name) kh_init_##name(void)
/*! @function
@abstract Destroy a hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
*/
#define kh_destroy(name, h) kh_destroy_##name(h)
/*! @function
@abstract Reset a hash table without deallocating memory.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
*/
#define kh_clear(name, h) kh_clear_##name(h)
/*! @function
@abstract Resize a hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param s New size [khuint_t]
*/
#define kh_resize(name, h, s) kh_resize_##name(h, s)
/*! @function
@abstract Insert a key to the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Key [type of keys]
@param r Extra return code: 0 if the key is present in the hash table;
1 if the bucket is empty (never used); 2 if the element in
the bucket has been deleted [int*]
@return Iterator to the inserted element [khuint_t]
*/
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
/*! @function
@abstract Retrieve a key from the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Key [type of keys]
@return Iterator to the found element, or kh_end(h) if the element is absent [khuint_t]
*/
#define kh_get(name, h, k) kh_get_##name(h, k)
/*! @function
@abstract Remove a key from the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Iterator to the element to be deleted [khuint_t]
*/
#define kh_del(name, h, k) kh_del_##name(h, k)
/*! @function
@abstract Test whether a bucket contains data.
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khuint_t]
@return 1 if containing data; 0 otherwise [int]
*/
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
/*! @function
@abstract Get key given an iterator
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khuint_t]
@return Key [type of keys]
*/
#define kh_key(h, x) ((h)->keys[x])
/*! @function
@abstract Get value given an iterator
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khuint_t]
@return Value [type of values]
@discussion For hash sets, calling this results in segfault.
*/
#define kh_val(h, x) ((h)->vals[x])
/*! @function
@abstract Alias of kh_val()
*/
#define kh_value(h, x) ((h)->vals[x])
/*! @function
@abstract Get the start iterator
@param h Pointer to the hash table [khash_t(name)*]
@return The start iterator [khuint_t]
*/
#define kh_begin(h) (khuint_t)(0)
/*! @function
@abstract Get the end iterator
@param h Pointer to the hash table [khash_t(name)*]
@return The end iterator [khuint_t]
*/
#define kh_end(h) ((h)->n_buckets)
/*! @function
@abstract Get the number of elements in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@return Number of elements in the hash table [khuint_t]
*/
#define kh_size(h) ((h)->size)
/*! @function
@abstract Get the number of buckets in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@return Number of buckets in the hash table [khuint_t]
*/
#define kh_n_buckets(h) ((h)->n_buckets)
/* More convenient interfaces */
/*! @function
@abstract Instantiate a hash set containing integer keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_INT(name) \
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT(name, khval_t) \
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
#define KHASH_MAP_INIT_UINT(name, khval_t) \
KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_UINT64(name) \
KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
#define KHASH_SET_INIT_INT64(name) \
KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_UINT64(name, khval_t) \
KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
#define KHASH_MAP_INIT_INT64(name, khval_t) \
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 16bit-integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT16(name, khval_t) \
KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
#define KHASH_MAP_INIT_UINT16(name, khval_t) \
KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 8bit-integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT8(name, khval_t) \
KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
#define KHASH_MAP_INIT_UINT8(name, khval_t) \
KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
typedef const char *kh_cstr_t;
/*! @function
@abstract Instantiate a hash map containing const char* keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_STR(name) \
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
/*! @function
@abstract Instantiate a hash map containing const char* keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_STR(name, khval_t) \
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
#define kh_exist_str(h, k) (kh_exist(h, k))
#define kh_exist_float64(h, k) (kh_exist(h, k))
#define kh_exist_uint64(h, k) (kh_exist(h, k))
#define kh_exist_int64(h, k) (kh_exist(h, k))
#define kh_exist_float32(h, k) (kh_exist(h, k))
#define kh_exist_int32(h, k) (kh_exist(h, k))
#define kh_exist_uint32(h, k) (kh_exist(h, k))
#define kh_exist_int16(h, k) (kh_exist(h, k))
#define kh_exist_uint16(h, k) (kh_exist(h, k))
#define kh_exist_int8(h, k) (kh_exist(h, k))
#define kh_exist_uint8(h, k) (kh_exist(h, k))
KHASH_MAP_INIT_STR(str, size_t)
KHASH_MAP_INIT_INT(int32, size_t)
KHASH_MAP_INIT_UINT(uint32, size_t)
KHASH_MAP_INIT_INT64(int64, size_t)
KHASH_MAP_INIT_UINT64(uint64, size_t)
KHASH_MAP_INIT_INT16(int16, size_t)
KHASH_MAP_INIT_UINT16(uint16, size_t)
KHASH_MAP_INIT_INT8(int8, size_t)
KHASH_MAP_INIT_UINT8(uint8, size_t)
#endif /* __AC_KHASH_H */

View File

@@ -0,0 +1,446 @@
#include <string.h>
#include <Python.h>
// use numpy's definitions for complex
#include <numpy/arrayobject.h>
typedef npy_complex64 khcomplex64_t;
typedef npy_complex128 khcomplex128_t;
// khash should report usage to tracemalloc
#if PY_VERSION_HEX >= 0x03060000
#include <pymem.h>
#if PY_VERSION_HEX < 0x03070000
#define PyTraceMalloc_Track _PyTraceMalloc_Track
#define PyTraceMalloc_Untrack _PyTraceMalloc_Untrack
#endif
#else
#define PyTraceMalloc_Track(...)
#define PyTraceMalloc_Untrack(...)
#endif
static const int KHASH_TRACE_DOMAIN = 424242;
void *traced_malloc(size_t size){
void * ptr = malloc(size);
if(ptr!=NULL){
PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
}
return ptr;
}
void *traced_calloc(size_t num, size_t size){
void * ptr = calloc(num, size);
if(ptr!=NULL){
PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num*size);
}
return ptr;
}
void *traced_realloc(void* old_ptr, size_t size){
void * ptr = realloc(old_ptr, size);
if(ptr!=NULL){
if(old_ptr != ptr){
PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr);
}
PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
}
return ptr;
}
void traced_free(void* ptr){
if(ptr!=NULL){
PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr);
}
free(ptr);
}
#define KHASH_MALLOC traced_malloc
#define KHASH_REALLOC traced_realloc
#define KHASH_CALLOC traced_calloc
#define KHASH_FREE traced_free
#include "khash.h"
// Previously we were using the built-in CPython hash function for doubles
// python 2.7 https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021
// python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85
// The python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x))
// and the size of hash may be different by platform / version (long in py2, Py_ssize_t in py3).
// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t
// is 64 bits the truncation causes collision issues. Given all that, we use our own
// simple hash, viewing the double bytes as an int64 and using khash's default
// hash for 64 bit integers.
// GH 13436 showed that _Py_HashDouble doesn't work well with khash
// GH 28303 showed, that the simple xoring-version isn't good enough
// See GH 36729 for evaluation of the currently used murmur2-hash version
// An interesting alternative to expensive murmur2-hash would be to change
// the probing strategy and use e.g. the probing strategy from CPython's
// implementation of dicts, which shines for smaller sizes but is more
// predisposed to superlinear running times (see GH 36729 for comparison)
khuint64_t PANDAS_INLINE asuint64(double key) {
khuint64_t val;
memcpy(&val, &key, sizeof(double));
return val;
}
khuint32_t PANDAS_INLINE asuint32(float key) {
khuint32_t val;
memcpy(&val, &key, sizeof(float));
return val;
}
#define ZERO_HASH 0
#define NAN_HASH 0
khuint32_t PANDAS_INLINE kh_float64_hash_func(double val){
// 0.0 and -0.0 should have the same hash:
if (val == 0.0){
return ZERO_HASH;
}
// all nans should have the same hash:
if ( val!=val ){
return NAN_HASH;
}
khuint64_t as_int = asuint64(val);
return murmur2_64to32(as_int);
}
khuint32_t PANDAS_INLINE kh_float32_hash_func(float val){
// 0.0 and -0.0 should have the same hash:
if (val == 0.0f){
return ZERO_HASH;
}
// all nans should have the same hash:
if ( val!=val ){
return NAN_HASH;
}
khuint32_t as_int = asuint32(val);
return murmur2_32to32(as_int);
}
#define kh_floats_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
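// Added note: combined with ZERO_HASH/NAN_HASH above, this makes 0.0 and -0.0
// hash and compare equal and puts all NaNs into one equivalence class
// (kh_floats_hash_equal(NAN, NAN) is true), unlike a plain IEEE `==` compare.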
#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \
KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_floats_hash_equal)
KHASH_MAP_INIT_FLOAT64(float64, size_t)
#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \
KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, kh_floats_hash_equal)
KHASH_MAP_INIT_FLOAT32(float32, size_t)
khint32_t PANDAS_INLINE kh_complex128_hash_func(khcomplex128_t val){
return kh_float64_hash_func(val.real)^kh_float64_hash_func(val.imag);
}
khint32_t PANDAS_INLINE kh_complex64_hash_func(khcomplex64_t val){
return kh_float32_hash_func(val.real)^kh_float32_hash_func(val.imag);
}
#define kh_complex_hash_equal(a, b) \
(kh_floats_hash_equal(a.real, b.real) && kh_floats_hash_equal(a.imag, b.imag))
#define KHASH_MAP_INIT_COMPLEX64(name, khval_t) \
KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, kh_complex_hash_equal)
KHASH_MAP_INIT_COMPLEX64(complex64, size_t)
#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \
KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, kh_complex_hash_equal)
KHASH_MAP_INIT_COMPLEX128(complex128, size_t)
#define kh_exist_complex64(h, k) (kh_exist(h, k))
#define kh_exist_complex128(h, k) (kh_exist(h, k))
// NaN-floats should be in the same equivalency class, see GH 22119
int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){
return (
Py_IS_NAN(PyFloat_AS_DOUBLE(a)) &&
Py_IS_NAN(PyFloat_AS_DOUBLE(b))
)
||
( PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b) );
}
// NaNs should be in the same equivalency class, see GH 41836
// PyObject_RichCompareBool for complexobjects has a different behavior
// needs to be replaced
int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){
return (
Py_IS_NAN(a->cval.real) &&
Py_IS_NAN(b->cval.real) &&
Py_IS_NAN(a->cval.imag) &&
Py_IS_NAN(b->cval.imag)
)
||
(
Py_IS_NAN(a->cval.real) &&
Py_IS_NAN(b->cval.real) &&
a->cval.imag == b->cval.imag
)
||
(
a->cval.real == b->cval.real &&
Py_IS_NAN(a->cval.imag) &&
Py_IS_NAN(b->cval.imag)
)
||
(
a->cval.real == b->cval.real &&
a->cval.imag == b->cval.imag
);
}
int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b);
// replacing PyObject_RichCompareBool (NaN!=NaN) with pyobject_cmp (NaN==NaN),
// which treats NaNs as equivalent
// see GH 41836
int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){
Py_ssize_t i;
if (Py_SIZE(a) != Py_SIZE(b)) {
return 0;
}
for (i = 0; i < Py_SIZE(a); ++i) {
if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) {
return 0;
}
}
return 1;
}
int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
if (a == b) {
return 1;
}
if (Py_TYPE(a) == Py_TYPE(b)) {
// special handling for some built-in types which could have NaNs
// as we would like to have them equivalent, but the usual
// PyObject_RichCompareBool would return False
if (PyFloat_CheckExact(a)) {
return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b);
}
if (PyComplex_CheckExact(a)) {
return complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b);
}
if (PyTuple_CheckExact(a)) {
return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b);
}
// frozenset isn't yet supported
}
int result = PyObject_RichCompareBool(a, b, Py_EQ);
if (result < 0) {
PyErr_Clear();
return 0;
}
return result;
}
Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) {
// Since Python 3.10, NaN no longer has hash 0
if (Py_IS_NAN(val)) {
return 0;
}
#if PY_VERSION_HEX < 0x030A0000
return _Py_HashDouble(val);
#else
return _Py_HashDouble(NULL, val);
#endif
}
Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) {
return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key));
}
#define _PandasHASH_IMAG 1000003UL
// replaces _Py_HashDouble with _Pandas_HashDouble
Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real);
Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag);
if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) {
return -1;
}
Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash;
if (combined == (Py_uhash_t)-1) {
return -2;
}
return (Py_hash_t)combined;
}
khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
//we could use any hashing algorithm, this is the original CPython's for tuples
#if SIZEOF_PY_UHASH_T > 4
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
#else
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
#endif
Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
Py_ssize_t i, len = Py_SIZE(key);
PyObject **item = key->ob_item;
Py_uhash_t acc = _PandasHASH_XXPRIME_5;
for (i = 0; i < len; i++) {
Py_uhash_t lane = kh_python_hash_func(item[i]);
if (lane == (Py_uhash_t)-1) {
return -1;
}
acc += lane * _PandasHASH_XXPRIME_2;
acc = _PandasHASH_XXROTATE(acc);
acc *= _PandasHASH_XXPRIME_1;
}
/* Add input length, mangled to keep the historical value of hash(()). */
acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL);
if (acc == (Py_uhash_t)-1) {
return 1546275796;
}
return acc;
}
khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
Py_hash_t hash;
// For PyObject_Hash the following holds:
// hash(0.0) == 0 == hash(-0.0)
// yet different NaN objects can have different hash values
if (PyFloat_CheckExact(key)) {
// we cannot use kh_float64_hash_func
// because float(k) == k holds for any int-object k
// and kh_float64_hash_func doesn't respect it
hash = floatobject_hash((PyFloatObject*)key);
}
else if (PyComplex_CheckExact(key)) {
// we cannot use kh_complex128_hash_func
// because complex(k,0) == k holds for any int-object k
// and kh_complex128_hash_func doesn't respect it
hash = complexobject_hash((PyComplexObject*)key);
}
else if (PyTuple_CheckExact(key)) {
hash = tupleobject_hash((PyTupleObject*)key);
}
else {
hash = PyObject_Hash(key);
}
if (hash == -1) {
PyErr_Clear();
return 0;
}
#if SIZEOF_PY_HASH_T == 4
// it is already 32bit value
return hash;
#else
// for 64bit builds,
// we need information of the upper 32bits as well
// see GH 37615
khuint64_t as_uint = (khuint64_t) hash;
// uints avoid undefined behavior of signed ints
return (as_uint>>32)^as_uint;
#endif
}
#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))
// Python object
typedef PyObject* kh_pyobject_t;
#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \
KHASH_INIT(name, kh_pyobject_t, khval_t, 1, \
kh_python_hash_func, kh_python_hash_equal)
KHASH_MAP_INIT_PYOBJECT(pymap, Py_ssize_t)
#define KHASH_SET_INIT_PYOBJECT(name) \
KHASH_INIT(name, kh_pyobject_t, char, 0, \
kh_python_hash_func, kh_python_hash_equal)
KHASH_SET_INIT_PYOBJECT(pyset)
#define kh_exist_pymap(h, k) (kh_exist(h, k))
#define kh_exist_pyset(h, k) (kh_exist(h, k))
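/*
   Illustrative sketch (not part of the original file): mapping PyObject* keys
   to positions with the pymap table generated above. Assumes the GIL is held;
   real callers (pandas' Cython hashtable module) also manage the key's
   reference count. The helper name is hypothetical.
*/
Py_ssize_t PANDAS_INLINE pymap_get_or_insert_example(kh_pymap_t *table,
                                                     PyObject *key,
                                                     Py_ssize_t next_position) {
    int absent = 0;
    // kh_put returns the bucket; `absent` is non-zero if the key was not
    // present before this call
    khuint_t bucket = kh_put_pymap(table, key, &absent);
    if (absent != 0) {
        kh_value(table, bucket) = next_position;  // first occurrence of key
    }
    return kh_value(table, bucket);
}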
KHASH_MAP_INIT_STR(strbox, kh_pyobject_t)
typedef struct {
kh_str_t *table;
int starts[256];
} kh_str_starts_t;
typedef kh_str_starts_t* p_kh_str_starts_t;
p_kh_str_starts_t PANDAS_INLINE kh_init_str_starts(void) {
kh_str_starts_t *result = (kh_str_starts_t*)KHASH_CALLOC(1, sizeof(kh_str_starts_t));
result->table = kh_init_str();
return result;
}
khuint_t PANDAS_INLINE kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) {
khuint_t result = kh_put_str(table->table, key, ret);
if (*ret != 0) {
table->starts[(unsigned char)key[0]] = 1;
}
return result;
}
khuint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t* table, const char* key) {
unsigned char ch = *key;
if (table->starts[ch]) {
if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) return 1;
}
return 0;
}
void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
kh_destroy_str(table->table);
KHASH_FREE(table);
}
void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
kh_resize_str(table->table, val);
}
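/*
   Illustrative sketch (not part of the original file): kh_str_starts is a
   string set with a 256-entry first-byte filter, so lookups for strings whose
   first character was never inserted return without probing the hash table.
   The helper name and the NA-like strings are hypothetical; khash stores the
   key pointers directly, so the string literals must outlive the table.
*/
int PANDAS_INLINE str_starts_usage_example(void) {
    int ret = 0, hit;
    kh_str_starts_t *na_set = kh_init_str_starts();
    kh_put_str_starts_item(na_set, "NaN", &ret);
    kh_put_str_starts_item(na_set, "NULL", &ret);
    hit = kh_get_str_starts_item(na_set, "NaN");   /* 1: found */
    hit = kh_get_str_starts_item(na_set, "xyz");   /* 0: 'x' never inserted, */
                                                   /*    table not probed    */
    kh_destroy_str_starts(na_set);
    return hit;
}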
// utility function: given the number of elements,
// returns the number of buckets needed
khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
khuint_t candidate = n_elements;
kroundup32(candidate);
khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
return (upper_bound < n_elements) ? 2*candidate : candidate;
}

View File

@@ -0,0 +1,100 @@
/*
Copyright (c) 2016, PyData Development Team
All rights reserved.
Distributed under the terms of the BSD Simplified License.
The full license is in the LICENSE file, distributed with this software.
*/
#ifndef PANDAS__LIBS_SRC_PARSE_HELPER_H_
#define PANDAS__LIBS_SRC_PARSE_HELPER_H_
#include <float.h>
#include "parser/tokenizer.h"
int to_double(char *item, double *p_value, char sci, char decimal,
int *maybe_int) {
char *p_end = NULL;
int error = 0;
/* Switch to precise xstrtod GH 31364 */
*p_value = precise_xstrtod(item, &p_end, decimal, sci, '\0', 1,
&error, maybe_int);
return (error == 0) && (!*p_end);
}
int floatify(PyObject *str, double *result, int *maybe_int) {
int status;
char *data;
PyObject *tmp = NULL;
const char sci = 'E';
const char dec = '.';
if (PyBytes_Check(str)) {
data = PyBytes_AS_STRING(str);
} else if (PyUnicode_Check(str)) {
tmp = PyUnicode_AsUTF8String(str);
if (tmp == NULL) {
return -1;
}
data = PyBytes_AS_STRING(tmp);
} else {
PyErr_SetString(PyExc_TypeError, "Invalid object type");
return -1;
}
status = to_double(data, result, sci, dec, maybe_int);
if (!status) {
/* handle inf/-inf infinity/-infinity */
if (strlen(data) == 3) {
if (0 == strcasecmp(data, "inf")) {
*result = HUGE_VAL;
*maybe_int = 0;
} else {
goto parsingerror;
}
} else if (strlen(data) == 4) {
if (0 == strcasecmp(data, "-inf")) {
*result = -HUGE_VAL;
*maybe_int = 0;
} else if (0 == strcasecmp(data, "+inf")) {
*result = HUGE_VAL;
*maybe_int = 0;
} else {
goto parsingerror;
}
} else if (strlen(data) == 8) {
if (0 == strcasecmp(data, "infinity")) {
*result = HUGE_VAL;
*maybe_int = 0;
} else {
goto parsingerror;
}
} else if (strlen(data) == 9) {
if (0 == strcasecmp(data, "-infinity")) {
*result = -HUGE_VAL;
*maybe_int = 0;
} else if (0 == strcasecmp(data, "+infinity")) {
*result = HUGE_VAL;
*maybe_int = 0;
} else {
goto parsingerror;
}
} else {
goto parsingerror;
}
}
Py_XDECREF(tmp);
return 0;
parsingerror:
PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"", data);
Py_XDECREF(tmp);
return -1;
}
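/*
   Illustrative sketch (not part of the original header): expected behaviour of
   floatify() for an ordinary number and for one of the inf spellings
   special-cased above. Assumes the GIL is held; the helper name is
   hypothetical.
*/
static void floatify_usage_example(void) {
    double value = 0.0;
    int maybe_int = 0;
    PyObject *s = PyUnicode_FromString("3.14");
    if (s != NULL && floatify(s, &value, &maybe_int) == 0) {
        /* value == 3.14, maybe_int == 0 because a decimal point was seen */
    }
    Py_XDECREF(s);
    s = PyUnicode_FromString("-infinity");
    if (s != NULL && floatify(s, &value, &maybe_int) == 0) {
        /* value == -HUGE_VAL via the strlen(data) == 9 branch above */
    }
    Py_XDECREF(s);
}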
#endif // PANDAS__LIBS_SRC_PARSE_HELPER_H_

View File

@@ -0,0 +1,107 @@
/*
Copyright (c) 2016, PyData Development Team
All rights reserved.
Distributed under the terms of the BSD Simplified License.
The full license is in the LICENSE file, distributed with this software.
*/
#include "io.h"
/*
On-disk FILE, uncompressed
*/
void *new_rd_source(PyObject *obj) {
rd_source *rds = (rd_source *)malloc(sizeof(rd_source));
if (rds == NULL) {
PyErr_NoMemory();
return NULL;
}
/* hold on to this object */
Py_INCREF(obj);
rds->obj = obj;
rds->buffer = NULL;
rds->position = 0;
return (void *)rds;
}
/*
Cleanup callbacks
*/
int del_rd_source(void *rds) {
Py_XDECREF(RDS(rds)->obj);
Py_XDECREF(RDS(rds)->buffer);
free(rds);
return 0;
}
/*
IO callbacks
*/
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
int *status, const char *encoding_errors) {
PyGILState_STATE state;
PyObject *result, *func, *args, *tmp;
void *retval;
size_t length;
rd_source *src = RDS(source);
state = PyGILState_Ensure();
/* delete old object */
Py_XDECREF(src->buffer);
src->buffer = NULL;
args = Py_BuildValue("(i)", nbytes);
func = PyObject_GetAttrString(src->obj, "read");
/* TODO: does this release the GIL? */
result = PyObject_CallObject(func, args);
Py_XDECREF(args);
Py_XDECREF(func);
if (result == NULL) {
PyGILState_Release(state);
*bytes_read = 0;
*status = CALLING_READ_FAILED;
return NULL;
} else if (!PyBytes_Check(result)) {
tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors);
Py_DECREF(result);
if (tmp == NULL) {
PyGILState_Release(state);
return NULL;
}
result = tmp;
}
length = PySequence_Length(result);
if (length == 0)
*status = REACHED_EOF;
else
*status = 0;
/* hang on to the Python object */
src->buffer = result;
retval = (void *)PyBytes_AsString(result);
PyGILState_Release(state);
/* TODO: more error handling */
*bytes_read = length;
return retval;
}
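/*
   Illustrative sketch (not part of the original file): the calling contract of
   the callbacks above. The real consumer is the tokenizer, which stores them
   as parser->cb_io / parser->cb_cleanup; this hypothetical driver loop only
   shows the status protocol (REACHED_EOF / CALLING_READ_FAILED).
*/
static int rd_source_usage_example(PyObject *file_like) {
    size_t bytes_read = 0;
    int status = 0;
    void *src = new_rd_source(file_like);          /* INCREFs file_like */
    if (src == NULL) {
        return -1;
    }
    do {
        void *buf = buffer_rd_bytes(src, 65536, &bytes_read, &status,
                                    "strict");
        if (buf == NULL) {
            break;                   /* a Python exception has been set */
        }
        /* process bytes_read bytes starting at buf; the pointer is only
           valid until the next buffer_rd_bytes call replaces src->buffer */
    } while (status != REACHED_EOF);
    del_rd_source(src);                            /* DECREFs held objects */
    return status == REACHED_EOF ? 0 : -1;
}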

View File

@@ -0,0 +1,34 @@
/*
Copyright (c) 2016, PyData Development Team
All rights reserved.
Distributed under the terms of the BSD Simplified License.
The full license is in the LICENSE file, distributed with this software.
*/
#ifndef PANDAS__LIBS_SRC_PARSER_IO_H_
#define PANDAS__LIBS_SRC_PARSER_IO_H_
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include "tokenizer.h"
#define FS(source) ((file_source *)source)
typedef struct _rd_source {
PyObject *obj;
PyObject *buffer;
size_t position;
} rd_source;
#define RDS(source) ((rd_source *)source)
void *new_rd_source(PyObject *obj);
int del_rd_source(void *src);
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
int *status, const char *encoding_errors);
#endif // PANDAS__LIBS_SRC_PARSER_IO_H_

File diff suppressed because it is too large

View File

@@ -0,0 +1,236 @@
/*
Copyright (c) 2012, Lambda Foundry, Inc., except where noted
Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause
BSD
See LICENSE for the license
*/
#ifndef PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
#define PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#define ERROR_NO_DIGITS 1
#define ERROR_OVERFLOW 2
#define ERROR_INVALID_CHARS 3
#include "../headers/stdint.h"
#include "../inline_helper.h"
#include "../headers/portable.h"
#include "khash.h"
#define STREAM_INIT_SIZE 32
#define REACHED_EOF 1
#define CALLING_READ_FAILED 2
/*
C flat file parsing low level code for pandas / NumPy
*/
/*
* Common set of error types for the read_rows() and tokenize()
* functions.
*/
// #define VERBOSE
#if defined(VERBOSE)
#define TRACE(X) printf X;
#else
#define TRACE(X)
#endif // VERBOSE
#define PARSER_OUT_OF_MEMORY -1
/*
* TODO: Might want to couple count_rows() with read_rows() to avoid
* duplication of some file I/O.
*/
typedef enum {
START_RECORD,
START_FIELD,
ESCAPED_CHAR,
IN_FIELD,
IN_QUOTED_FIELD,
ESCAPE_IN_QUOTED_FIELD,
QUOTE_IN_QUOTED_FIELD,
EAT_CRNL,
EAT_CRNL_NOP,
EAT_WHITESPACE,
EAT_COMMENT,
EAT_LINE_COMMENT,
WHITESPACE_LINE,
START_FIELD_IN_SKIP_LINE,
IN_FIELD_IN_SKIP_LINE,
IN_QUOTED_FIELD_IN_SKIP_LINE,
QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE,
FINISHED
} ParserState;
typedef enum {
QUOTE_MINIMAL,
QUOTE_ALL,
QUOTE_NONNUMERIC,
QUOTE_NONE
} QuoteStyle;
typedef enum {
ERROR,
WARN,
SKIP
} BadLineHandleMethod;
typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
int *status, const char *encoding_errors);
typedef int (*io_cleanup)(void *src);
typedef struct parser_t {
void *source;
io_callback cb_io;
io_cleanup cb_cleanup;
int64_t chunksize; // Number of bytes to prepare for each chunk
char *data; // pointer to data to be processed
int64_t datalen; // amount of data available
int64_t datapos;
// where to write out tokenized data
char *stream;
uint64_t stream_len;
uint64_t stream_cap;
// Store words in (potentially ragged) matrix for now, hmm
char **words;
int64_t *word_starts; // where we are in the stream
uint64_t words_len;
uint64_t words_cap;
uint64_t max_words_cap; // maximum word cap encountered
char *pword_start; // pointer to stream start of current field
int64_t word_start; // position start of current field
int64_t *line_start; // position in words for start of line
int64_t *line_fields; // Number of fields in each line
uint64_t lines; // Number of (good) lines observed
uint64_t file_lines; // Number of lines (including bad or skipped)
uint64_t lines_cap; // Vector capacity
// Tokenizing stuff
ParserState state;
int doublequote; /* is " represented by ""? */
char delimiter; /* field separator */
int delim_whitespace; /* delimit by consuming space/tabs instead */
char quotechar; /* quote character */
char escapechar; /* escape character */
char lineterminator;
int skipinitialspace; /* ignore spaces following delimiter? */
int quoting; /* style of quoting to write */
char commentchar;
int allow_embedded_newline;
int usecols; // Boolean: 1: usecols provided, 0: none provided
Py_ssize_t expected_fields;
BadLineHandleMethod on_bad_lines;
// floating point options
char decimal;
char sci;
// thousands separator (comma, period)
char thousands;
int header; // Boolean: 1: has header, 0: no header
int64_t header_start; // header row start
uint64_t header_end; // header row end
void *skipset;
PyObject *skipfunc;
int64_t skip_first_N_rows;
int64_t skip_footer;
double (*double_converter)(const char *, char **,
char, char, char, int, int *, int *);
// error handling
char *warn_msg;
char *error_msg;
int skip_empty_lines;
} parser_t;
typedef struct coliter_t {
char **words;
int64_t *line_start;
int64_t col;
} coliter_t;
void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start);
#define COLITER_NEXT(iter, word) \
do { \
const int64_t i = *iter.line_start++ + iter.col; \
word = i >= *iter.line_start ? "" : iter.words[i]; \
} while (0)
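/*
   Illustrative sketch (not part of the original header): iterating over one
   column of the tokenized words with coliter_t. `parser` is assumed to be a
   fully tokenized parser_t; the helper name is hypothetical.
*/
static void coliter_usage_example(parser_t *parser, int64_t col) {
    coliter_t it;
    const char *word;
    uint64_t row;
    coliter_setup(&it, parser, col, 0);      /* start at line 0 */
    for (row = 0; row < parser->lines; ++row) {
        COLITER_NEXT(it, word);              /* "" when the row is too short */
        TRACE(("row %llu: %s\n", (unsigned long long)row, word));
    }
}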
parser_t *parser_new(void);
int parser_init(parser_t *self);
int parser_consume_rows(parser_t *self, size_t nrows);
int parser_trim_buffers(parser_t *self);
int parser_add_skiprow(parser_t *self, int64_t row);
int parser_set_skipfirstnrows(parser_t *self, int64_t nrows);
void parser_free(parser_t *self);
void parser_del(parser_t *self);
void parser_set_default_options(parser_t *self);
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors);
int tokenize_all_rows(parser_t *self, const char *encoding_errors);
// Have parsed / type-converted a chunk of data
// and want to free memory from the token stream
typedef struct uint_state {
int seen_sint;
int seen_uint;
int seen_null;
} uint_state;
void uint_state_init(uint_state *self);
int uint64_conflict(uint_state *self);
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep);
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep);
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
int skip_trailing, int *error, int *maybe_int);
double precise_xstrtod(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
int *error, int *maybe_int);
// GH-15140 - round_trip requires and acquires the GIL on its own
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
int skip_trailing, int *error, int *maybe_int);
int to_boolean(const char *item, uint8_t *val);
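/*
   Illustrative sketch (not part of the original header): the life cycle of a
   parser_t as suggested by the declarations above. `my_read` / `my_cleanup`
   are hypothetical callbacks matching io_callback / io_cleanup (the real ones
   live in io.c); the call order is assumed and error handling is abbreviated.
*/
static int tokenizer_usage_example(void *source, io_callback my_read,
                                   io_cleanup my_cleanup) {
    int status;
    parser_t *parser = parser_new();
    if (parser == NULL) {
        return PARSER_OUT_OF_MEMORY;
    }
    parser_set_default_options(parser);
    parser->source = source;
    parser->cb_io = my_read;
    parser->cb_cleanup = my_cleanup;
    if (parser_init(parser) != 0) {
        parser_free(parser);
        parser_del(parser);
        return PARSER_OUT_OF_MEMORY;
    }
    status = tokenize_all_rows(parser, "strict");
    /* parser->words / parser->line_start now describe the tokenized fields */
    parser_free(parser);
    parser_del(parser);
    return status;
}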
#endif // PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_

View File

@@ -0,0 +1,300 @@
/*
Copyright (c) 2016, PyData Development Team
All rights reserved.
Distributed under the terms of the BSD Simplified License.
The full license is in the LICENSE file, distributed with this software.
Flexibly-sized, index-able skiplist data structure for maintaining a sorted
list of values
Port of Wes McKinney's Cython version of Raymond Hettinger's original pure
Python recipe (https://rhettinger.wordpress.com/2010/02/06/lost-knowledge/)
*/
#ifndef PANDAS__LIBS_SRC_SKIPLIST_H_
#define PANDAS__LIBS_SRC_SKIPLIST_H_
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "inline_helper.h"
PANDAS_INLINE float __skiplist_nanf(void) {
const union {
int __i;
float __f;
} __bint = {0x7fc00000UL};
return __bint.__f;
}
#define PANDAS_NAN ((double)__skiplist_nanf())
PANDAS_INLINE double Log2(double val) { return log(val) / log(2.); }
typedef struct node_t node_t;
struct node_t {
node_t **next;
int *width;
double value;
int is_nil;
int levels;
int ref_count;
};
typedef struct {
node_t *head;
node_t **tmp_chain;
int *tmp_steps;
int size;
int maxlevels;
} skiplist_t;
PANDAS_INLINE double urand(void) {
return ((double)rand() + 1) / ((double)RAND_MAX + 2);
}
PANDAS_INLINE int int_min(int a, int b) { return a < b ? a : b; }
PANDAS_INLINE node_t *node_init(double value, int levels) {
node_t *result;
result = (node_t *)malloc(sizeof(node_t));
if (result) {
result->value = value;
result->levels = levels;
result->is_nil = 0;
result->ref_count = 0;
result->next = (node_t **)malloc(levels * sizeof(node_t *));
result->width = (int *)malloc(levels * sizeof(int));
if (!(result->next && result->width) && (levels != 0)) {
free(result->next);
free(result->width);
free(result);
return NULL;
}
}
return result;
}
// do this ourselves
PANDAS_INLINE void node_incref(node_t *node) { ++(node->ref_count); }
PANDAS_INLINE void node_decref(node_t *node) { --(node->ref_count); }
static void node_destroy(node_t *node) {
int i;
if (node) {
if (node->ref_count <= 1) {
for (i = 0; i < node->levels; ++i) {
node_destroy(node->next[i]);
}
free(node->next);
free(node->width);
// printf("Reference count was 1, freeing\n");
free(node);
} else {
node_decref(node);
}
// pretty sure that freeing the struct above will be enough
}
}
PANDAS_INLINE void skiplist_destroy(skiplist_t *skp) {
if (skp) {
node_destroy(skp->head);
free(skp->tmp_steps);
free(skp->tmp_chain);
free(skp);
}
}
PANDAS_INLINE skiplist_t *skiplist_init(int expected_size) {
skiplist_t *result;
node_t *NIL, *head;
int maxlevels, i;
maxlevels = 1 + Log2((double)expected_size);
result = (skiplist_t *)malloc(sizeof(skiplist_t));
if (!result) {
return NULL;
}
result->tmp_chain = (node_t **)malloc(maxlevels * sizeof(node_t *));
result->tmp_steps = (int *)malloc(maxlevels * sizeof(int));
result->maxlevels = maxlevels;
result->size = 0;
head = result->head = node_init(PANDAS_NAN, maxlevels);
NIL = node_init(0.0, 0);
if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) {
skiplist_destroy(result);
node_destroy(NIL);
return NULL;
}
node_incref(head);
NIL->is_nil = 1;
for (i = 0; i < maxlevels; ++i) {
head->next[i] = NIL;
head->width[i] = 1;
node_incref(NIL);
}
return result;
}
// 1 if left < right, 0 if left == right, -1 if left > right
PANDAS_INLINE int _node_cmp(node_t *node, double value) {
if (node->is_nil || node->value > value) {
return -1;
} else if (node->value < value) {
return 1;
} else {
return 0;
}
}
PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) {
node_t *node;
int level;
if (i < 0 || i >= skp->size) {
*ret = 0;
return 0;
}
node = skp->head;
++i;
for (level = skp->maxlevels - 1; level >= 0; --level) {
while (node->width[level] <= i) {
i -= node->width[level];
node = node->next[level];
}
}
*ret = 1;
return node->value;
}
// Returns the lowest rank of all elements with value `value`, as opposed to the
// highest rank returned by `skiplist_insert`.
PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) {
node_t *node;
int level, rank = 0;
node = skp->head;
for (level = skp->maxlevels - 1; level >= 0; --level) {
while (_node_cmp(node->next[level], value) > 0) {
rank += node->width[level];
node = node->next[level];
}
}
return rank + 1;
}
// Returns the rank of the inserted element. When there are duplicates,
// `rank` is the highest of the group, i.e. the 'max' method of
// https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html
PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
node_t *node, *prevnode, *newnode, *next_at_level;
int *steps_at_level;
int size, steps, level, rank = 0;
node_t **chain;
chain = skp->tmp_chain;
steps_at_level = skp->tmp_steps;
memset(steps_at_level, 0, skp->maxlevels * sizeof(int));
node = skp->head;
for (level = skp->maxlevels - 1; level >= 0; --level) {
next_at_level = node->next[level];
while (_node_cmp(next_at_level, value) >= 0) {
steps_at_level[level] += node->width[level];
rank += node->width[level];
node = next_at_level;
next_at_level = node->next[level];
}
chain[level] = node;
}
size = int_min(skp->maxlevels, 1 - ((int)Log2(urand())));
newnode = node_init(value, size);
if (!newnode) {
return -1;
}
steps = 0;
for (level = 0; level < size; ++level) {
prevnode = chain[level];
newnode->next[level] = prevnode->next[level];
prevnode->next[level] = newnode;
node_incref(newnode); // increment the reference count
newnode->width[level] = prevnode->width[level] - steps;
prevnode->width[level] = steps + 1;
steps += steps_at_level[level];
}
for (level = size; level < skp->maxlevels; ++level) {
chain[level]->width[level] += 1;
}
++(skp->size);
return rank + 1;
}
PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) {
int level, size;
node_t *node, *prevnode, *tmpnode, *next_at_level;
node_t **chain;
chain = skp->tmp_chain;
node = skp->head;
for (level = skp->maxlevels - 1; level >= 0; --level) {
next_at_level = node->next[level];
while (_node_cmp(next_at_level, value) > 0) {
node = next_at_level;
next_at_level = node->next[level];
}
chain[level] = node;
}
if (value != chain[0]->next[0]->value) {
return 0;
}
size = chain[0]->next[0]->levels;
for (level = 0; level < size; ++level) {
prevnode = chain[level];
tmpnode = prevnode->next[level];
prevnode->width[level] += tmpnode->width[level] - 1;
prevnode->next[level] = tmpnode->next[level];
tmpnode->next[level] = NULL;
node_destroy(tmpnode); // decrement refcount or free
}
for (level = size; level < skp->maxlevels; ++level) {
--(chain[level]->width[level]);
}
--(skp->size);
return 1;
}
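/*
   Illustrative sketch (not part of the original header): the indexable
   skiplist keeps its values sorted, so the median of the inserted values is
   simply the middle element by rank. A hypothetical helper, not pandas API;
   the rolling-window code uses the same insert/get/remove primitives.
*/
PANDAS_INLINE double skiplist_median_example(const double *values, int n) {
    skiplist_t *skp;
    double median;
    int i, ok = 0;
    if (n <= 0) {
        return PANDAS_NAN;
    }
    skp = skiplist_init(n);
    if (skp == NULL) {
        return PANDAS_NAN;
    }
    for (i = 0; i < n; ++i) {
        skiplist_insert(skp, values[i]);         /* O(log n) per insert */
    }
    if (n % 2 == 1) {
        median = skiplist_get(skp, n / 2, &ok);
    } else {
        median = 0.5 * (skiplist_get(skp, n / 2 - 1, &ok) +
                        skiplist_get(skp, n / 2, &ok));
    }
    skiplist_destroy(skp);
    return median;
}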
#endif // PANDAS__LIBS_SRC_SKIPLIST_H_

View File

@@ -0,0 +1,316 @@
/*
Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the ESN Social Software AB nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
https://github.com/client9/stringencoders
Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
Numeric decoder derived from the TCL library
https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
* Copyright (c) 1988-1993 The Regents of the University of California.
* Copyright (c) 1994 Sun Microsystems, Inc.
*/
/*
Ultra fast JSON encoder and decoder
Developed by Jonas Tarnstrom (jonas@esn.me).
Encoder notes:
------------------
:: Cyclic references ::
Cyclic referenced objects are not detected.
Set JSONObjectEncoder.recursionMax to suitable value or make sure input object
tree doesn't have cyclic references.
*/
#ifndef PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_
#define PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_
#include <stdio.h>
#include <wchar.h>
// Don't output any extra whitespaces when encoding
#define JSON_NO_EXTRA_WHITESPACE
// Max decimals to encode double floating point numbers with
#ifndef JSON_DOUBLE_MAX_DECIMALS
#define JSON_DOUBLE_MAX_DECIMALS 15
#endif
// Max recursion depth, default for encoder
#ifndef JSON_MAX_RECURSION_DEPTH
#define JSON_MAX_RECURSION_DEPTH 1024
#endif
// Max recursion depth, default for decoder
#ifndef JSON_MAX_OBJECT_DEPTH
#define JSON_MAX_OBJECT_DEPTH 1024
#endif
/*
Dictates how much stack space UltraJSON will use for buffers before resorting to the provided heap functions */
#ifndef JSON_MAX_STACK_BUFFER_SIZE
#define JSON_MAX_STACK_BUFFER_SIZE 131072
#endif
#ifdef _WIN32
typedef __int64 JSINT64;
typedef unsigned __int64 JSUINT64;
typedef __int32 JSINT32;
typedef unsigned __int32 JSUINT32;
typedef unsigned __int8 JSUINT8;
typedef unsigned __int16 JSUTF16;
typedef unsigned __int32 JSUTF32;
typedef __int64 JSLONG;
#define EXPORTFUNCTION __declspec(dllexport)
#define FASTCALL_MSVC __fastcall
#define INLINE_PREFIX static __inline
#else
#include <stdint.h>
typedef int64_t JSINT64;
typedef uint64_t JSUINT64;
typedef int32_t JSINT32;
typedef uint32_t JSUINT32;
#define FASTCALL_MSVC
#define INLINE_PREFIX static inline
typedef uint8_t JSUINT8;
typedef uint16_t JSUTF16;
typedef uint32_t JSUTF32;
typedef int64_t JSLONG;
#define EXPORTFUNCTION
#endif
#if !(defined(__LITTLE_ENDIAN__) || defined(__BIG_ENDIAN__))
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define __LITTLE_ENDIAN__
#else
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define __BIG_ENDIAN__
#endif
#endif
#endif
#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
#error "Endianness not supported"
#endif
enum JSTYPES {
JT_NULL, // NULL
JT_TRUE, // boolean true
JT_FALSE, // boolean false
JT_INT, // (JSINT32 (signed 32-bit))
JT_LONG, // (JSINT64 (signed 64-bit))
JT_DOUBLE, // (double)
JT_BIGNUM, // integer larger than sys.maxsize
JT_UTF8, // (char 8-bit)
JT_ARRAY, // Array structure
JT_OBJECT, // Key/Value structure
JT_INVALID, // Internal, do not return nor expect
JT_POS_INF, // Positive infinity
JT_NEG_INF, // Negative infinity
};
typedef void * JSOBJ;
typedef void * JSITER;
typedef struct __JSONTypeContext {
int type;
void *encoder;
void *prv;
} JSONTypeContext;
/*
Function pointer declarations, suitable for implementing UltraJSON */
typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc);
typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc);
typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc);
typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc);
typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc,
size_t *outLen);
typedef void *(*JSPFN_MALLOC)(size_t size);
typedef void (*JSPFN_FREE)(void *pptr);
typedef void *(*JSPFN_REALLOC)(void *base, size_t size);
typedef struct __JSONObjectEncoder {
void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc);
void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc);
const char *(*getStringValue)(JSOBJ obj, JSONTypeContext *tc,
size_t *_outLen);
JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc);
JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc);
double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc);
const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc,
size_t *_outLen);
/*
Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT)
Implementor should setup iteration state in ti->prv
*/
JSPFN_ITERBEGIN iterBegin;
/*
Retrieve the next object in an iteration. Should return 0 to indicate the iteration has reached its end, or 1 if there are more items.
The implementor is responsible for keeping the iteration state. Use the ti->prv fields for this
*/
JSPFN_ITERNEXT iterNext;
/*
Ends the iteration of an iterable object.
Any iteration state stored in ti->prv can be freed here
*/
JSPFN_ITEREND iterEnd;
/*
Returns a reference to the value object of an iterator.
The implementor is responsible for the life cycle of the returned value. Use iterNext/iterEnd and ti->prv to keep track of the current object
*/
JSPFN_ITERGETVALUE iterGetValue;
/*
Returns the name of the current iterator item.
The implementor is responsible for the life cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of the current object
*/
JSPFN_ITERGETNAME iterGetName;
/*
Release a value as indicated by setting ti->release = 1 in the previous getValue call.
The ti->prv array should contain the necessary context to release the value
*/
void (*releaseObject)(JSOBJ obj);
/* Library functions
Set to NULL to use STDLIB malloc,realloc,free */
JSPFN_MALLOC malloc;
JSPFN_REALLOC realloc;
JSPFN_FREE free;
/*
Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH)*/
int recursionMax;
/*
Configuration for max decimals of double floating point numbers to encode (0-9) */
int doublePrecision;
/*
If true, output will be ASCII with all characters above 127 encoded as \uXXXX. If false, output will be UTF-8 or whatever charset the strings are supplied in. */
int forceASCII;
/*
If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and \u0026, respectively. If false, no special encoding will be used. */
int encodeHTMLChars;
/*
Configuration for spaces of indent */
int indent;
/*
Set to an error message if error occurred */
const char *errorMsg;
JSOBJ errorObj;
/* Buffer stuff */
char *start;
char *offset;
char *end;
int heap;
int level;
} JSONObjectEncoder;
/*
Encode an object structure into JSON.
Arguments:
obj - An anonymous type representing the object
enc - Function definitions for querying JSOBJ type
buffer - Preallocated buffer to store result in. If NULL function allocates own buffer
cbBuffer - Length of buffer (ignored if buffer is NULL)
Returns:
Encoded JSON object as a null terminated char string.
NOTE:
If the supplied buffer wasn't large enough to hold the result, the function will allocate a new buffer.
The life cycle of the provided buffer must still be handled by the caller.
If the return value doesn't equal the specified buffer, the caller must release the memory using
JSONObjectEncoder.free or free(), whichever was specified when calling this function.
*/
EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc,
char *buffer, size_t cbBuffer);
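/*
   Illustrative sketch (not part of the original header): calling
   JSON_EncodeObject with a stack buffer and releasing the result only when the
   encoder had to fall back to a heap allocation, as described above. `enc` is
   assumed to be a fully populated JSONObjectEncoder with enc->free set.
*/
static void json_encode_usage_example(JSOBJ obj, JSONObjectEncoder *enc) {
    char stack_buffer[8192];
    char *result = JSON_EncodeObject(obj, enc, stack_buffer,
                                     sizeof(stack_buffer));
    if (result == NULL) {
        return;                      /* enc->errorMsg describes the failure */
    }
    /* ... use the null-terminated JSON string in `result` ... */
    if (result != stack_buffer) {
        /* the encoder allocated its own (heap) buffer */
        enc->free(result);
    }
}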
typedef struct __JSONObjectDecoder {
JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end);
int (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
int (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value);
JSOBJ (*newTrue)(void *prv);
JSOBJ (*newFalse)(void *prv);
JSOBJ (*newNull)(void *prv);
JSOBJ (*newPosInf)(void *prv);
JSOBJ (*newNegInf)(void *prv);
JSOBJ (*newObject)(void *prv, void *decoder);
JSOBJ (*endObject)(void *prv, JSOBJ obj);
JSOBJ (*newArray)(void *prv, void *decoder);
JSOBJ (*endArray)(void *prv, JSOBJ obj);
JSOBJ (*newInt)(void *prv, JSINT32 value);
JSOBJ (*newLong)(void *prv, JSINT64 value);
JSOBJ (*newUnsignedLong)(void *prv, JSUINT64 value);
JSOBJ (*newDouble)(void *prv, double value);
void (*releaseObject)(void *prv, JSOBJ obj, void *decoder);
JSPFN_MALLOC malloc;
JSPFN_FREE free;
JSPFN_REALLOC realloc;
char *errorStr;
char *errorOffset;
int preciseFloat;
void *prv;
} JSONObjectDecoder;
EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec,
const char *buffer, size_t cbBuffer);
EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t);
#endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,601 @@
/*
Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the ESN Social Software AB nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
https://github.com/client9/stringencoders
Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
Numeric decoder derived from the TCL library
https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
* Copyright (c) 1988-1993 The Regents of the University of California.
* Copyright (c) 1994 Sun Microsystems, Inc.
*/
#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
#define NO_IMPORT_ARRAY
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <numpy/arrayobject.h>
#include <ultrajson.h>
#define PRINTMARK()
typedef struct __PyObjectDecoder {
JSONObjectDecoder dec;
void *npyarr; // Numpy context buffer
void *npyarr_addr; // Ref to npyarr ptr to track DECREF calls
npy_intp curdim; // Current array dimension
PyArray_Descr *dtype;
} PyObjectDecoder;
typedef struct __NpyArrContext {
PyObject *ret;
PyObject *labels[2];
PyArray_Dims shape;
PyObjectDecoder *dec;
npy_intp i;
npy_intp elsize;
npy_intp elcount;
} NpyArrContext;
// Numpy handling based on numpy internal code, specifically the function
// PyArray_FromIter.
// numpy related functions are inter-dependent so declare them all here,
// to ensure the compiler catches any errors
// standard numpy array handling
JSOBJ Object_npyNewArray(void *prv, void *decoder);
JSOBJ Object_npyEndArray(void *prv, JSOBJ obj);
int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value);
// for more complex dtypes (object and string) fill a standard Python list
// and convert to a numpy array when done.
JSOBJ Object_npyNewArrayList(void *prv, void *decoder);
JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj);
int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value);
// labelled support, encode keys and values of JS object into separate numpy
// arrays
JSOBJ Object_npyNewObject(void *prv, void *decoder);
JSOBJ Object_npyEndObject(void *prv, JSOBJ obj);
int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
// free the numpy context buffer
void Npy_releaseContext(NpyArrContext *npyarr) {
PRINTMARK();
if (npyarr) {
if (npyarr->shape.ptr) {
PyObject_Free(npyarr->shape.ptr);
}
if (npyarr->dec) {
npyarr->dec->npyarr = NULL;
npyarr->dec->curdim = 0;
}
Py_XDECREF(npyarr->labels[0]);
Py_XDECREF(npyarr->labels[1]);
Py_XDECREF(npyarr->ret);
PyObject_Free(npyarr);
}
}
JSOBJ Object_npyNewArray(void *prv, void *_decoder) {
NpyArrContext *npyarr;
PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
PRINTMARK();
if (decoder->curdim <= 0) {
// start of array - initialise the context buffer
npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext));
decoder->npyarr_addr = npyarr;
if (!npyarr) {
PyErr_NoMemory();
return NULL;
}
npyarr->dec = decoder;
npyarr->labels[0] = npyarr->labels[1] = NULL;
npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp) * NPY_MAXDIMS);
npyarr->shape.len = 1;
npyarr->ret = NULL;
npyarr->elsize = 0;
npyarr->elcount = 4;
npyarr->i = 0;
} else {
// starting a new dimension: continue the current array (and reshape
// afterwards)
npyarr = (NpyArrContext *)decoder->npyarr;
if (decoder->curdim >= npyarr->shape.len) {
npyarr->shape.len++;
}
}
npyarr->shape.ptr[decoder->curdim] = 0;
decoder->curdim++;
return npyarr;
}
PyObject *Npy_returnLabelled(NpyArrContext *npyarr) {
PyObject *ret = npyarr->ret;
npy_intp i;
if (npyarr->labels[0] || npyarr->labels[1]) {
// finished decoding, build tuple with values and labels
ret = PyTuple_New(npyarr->shape.len + 1);
for (i = 0; i < npyarr->shape.len; i++) {
if (npyarr->labels[i]) {
PyTuple_SET_ITEM(ret, i + 1, npyarr->labels[i]);
npyarr->labels[i] = NULL;
} else {
Py_INCREF(Py_None);
PyTuple_SET_ITEM(ret, i + 1, Py_None);
}
}
PyTuple_SET_ITEM(ret, 0, npyarr->ret);
}
return ret;
}
JSOBJ Object_npyEndArray(void *prv, JSOBJ obj) {
PyObject *ret;
char *new_data;
NpyArrContext *npyarr = (NpyArrContext *)obj;
int emptyType = NPY_DEFAULT_TYPE;
npy_intp i;
PRINTMARK();
if (!npyarr) {
return NULL;
}
ret = npyarr->ret;
i = npyarr->i;
npyarr->dec->curdim--;
if (i == 0 || !npyarr->ret) {
// empty array would not have been initialised so do it now.
if (npyarr->dec->dtype) {
emptyType = npyarr->dec->dtype->type_num;
}
npyarr->ret = ret =
PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0);
} else if (npyarr->dec->curdim <= 0) {
// realloc to final size
new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize);
if (new_data == NULL) {
PyErr_NoMemory();
Npy_releaseContext(npyarr);
return NULL;
}
((PyArrayObject *)ret)->data = (void *)new_data;
// PyArray_BYTES(ret) = new_data;
}
if (npyarr->dec->curdim <= 0) {
// finished decoding array, reshape if necessary
if (npyarr->shape.len > 1) {
npyarr->ret = PyArray_Newshape((PyArrayObject *)ret, &npyarr->shape,
NPY_ANYORDER);
Py_DECREF(ret);
}
ret = Npy_returnLabelled(npyarr);
npyarr->ret = NULL;
Npy_releaseContext(npyarr);
}
return ret;
}
int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
PyObject *type;
PyArray_Descr *dtype;
npy_intp i;
char *new_data, *item;
NpyArrContext *npyarr = (NpyArrContext *)obj;
PRINTMARK();
if (!npyarr) {
return 0;
}
i = npyarr->i;
npyarr->shape.ptr[npyarr->dec->curdim - 1]++;
if (PyArray_Check((PyObject *)value)) {
// multidimensional array, keep decoding values.
return 1;
}
if (!npyarr->ret) {
// Array not initialised yet.
// We do it here so we can 'sniff' the data type if none was provided
if (!npyarr->dec->dtype) {
type = PyObject_Type(value);
if (!PyArray_DescrConverter(type, &dtype)) {
Py_DECREF(type);
goto fail;
}
Py_INCREF(dtype);
Py_DECREF(type);
} else {
dtype = PyArray_DescrNew(npyarr->dec->dtype);
}
// If it's an object or string then fill a Python list and subsequently
// convert. Otherwise we would need to somehow mess about with
// reference counts when renewing memory.
npyarr->elsize = dtype->elsize;
if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) {
Py_XDECREF(dtype);
if (npyarr->dec->curdim > 1) {
PyErr_SetString(PyExc_ValueError,
"Cannot decode multidimensional arrays with "
"variable length elements to numpy");
goto fail;
}
npyarr->elcount = 0;
npyarr->ret = PyList_New(0);
if (!npyarr->ret) {
goto fail;
}
((JSONObjectDecoder *)npyarr->dec)->newArray =
Object_npyNewArrayList;
((JSONObjectDecoder *)npyarr->dec)->arrayAddItem =
Object_npyArrayListAddItem;
((JSONObjectDecoder *)npyarr->dec)->endArray =
Object_npyEndArrayList;
return Object_npyArrayListAddItem(prv, obj, value);
}
npyarr->ret = PyArray_NewFromDescr(
&PyArray_Type, dtype, 1, &npyarr->elcount, NULL, NULL, 0, NULL);
if (!npyarr->ret) {
goto fail;
}
}
if (i >= npyarr->elcount) {
// Grow PyArray_DATA(ret):
// this is similar for the strategy for PyListObject, but we use
// 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ...
if (npyarr->elsize == 0) {
PyErr_SetString(PyExc_ValueError,
"Cannot decode multidimensional arrays with "
"variable length elements to numpy");
goto fail;
}
npyarr->elcount = (i >> 1) + (i < 4 ? 4 : 2) + i;
if (npyarr->elcount <= NPY_MAX_INTP / npyarr->elsize) {
new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret),
npyarr->elcount * npyarr->elsize);
} else {
PyErr_NoMemory();
goto fail;
}
((PyArrayObject *)npyarr->ret)->data = (void *)new_data;
// PyArray_BYTES(npyarr->ret) = new_data;
}
PyArray_DIMS(npyarr->ret)[0] = i + 1;
if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL ||
PyArray_SETITEM(npyarr->ret, item, value) == -1) {
goto fail;
}
Py_DECREF((PyObject *)value);
npyarr->i++;
return 1;
fail:
Npy_releaseContext(npyarr);
return 0;
}
JSOBJ Object_npyNewArrayList(void *prv, void *_decoder) {
PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
PRINTMARK();
PyErr_SetString(
PyExc_ValueError,
"nesting not supported for object or variable length dtypes");
Npy_releaseContext(decoder->npyarr);
return NULL;
}
JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj) {
PyObject *list, *ret;
NpyArrContext *npyarr = (NpyArrContext *)obj;
PRINTMARK();
if (!npyarr) {
return NULL;
}
// convert decoded list to numpy array
list = (PyObject *)npyarr->ret;
npyarr->ret = PyArray_FROM_O(list);
ret = Npy_returnLabelled(npyarr);
npyarr->ret = list;
((JSONObjectDecoder *)npyarr->dec)->newArray = Object_npyNewArray;
((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem;
((JSONObjectDecoder *)npyarr->dec)->endArray = Object_npyEndArray;
Npy_releaseContext(npyarr);
return ret;
}
int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value) {
NpyArrContext *npyarr = (NpyArrContext *)obj;
PRINTMARK();
if (!npyarr) {
return 0;
}
PyList_Append((PyObject *)npyarr->ret, value);
Py_DECREF((PyObject *)value);
npyarr->elcount++;
return 1;
}
JSOBJ Object_npyNewObject(void *prv, void *_decoder) {
PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
PRINTMARK();
if (decoder->curdim > 1) {
PyErr_SetString(PyExc_ValueError,
"labels only supported up to 2 dimensions");
return NULL;
}
return ((JSONObjectDecoder *)decoder)->newArray(prv, decoder);
}
JSOBJ Object_npyEndObject(void *prv, JSOBJ obj) {
PyObject *list;
npy_intp labelidx;
NpyArrContext *npyarr = (NpyArrContext *)obj;
PRINTMARK();
if (!npyarr) {
return NULL;
}
labelidx = npyarr->dec->curdim - 1;
list = npyarr->labels[labelidx];
if (list) {
npyarr->labels[labelidx] = PyArray_FROM_O(list);
Py_DECREF(list);
}
return (PyObject *)((JSONObjectDecoder *)npyarr->dec)->endArray(prv, obj);
}
int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
PyObject *label, *labels;
npy_intp labelidx;
// add key to label array, value to values array
NpyArrContext *npyarr = (NpyArrContext *)obj;
PRINTMARK();
if (!npyarr) {
return 0;
}
label = (PyObject *)name;
labelidx = npyarr->dec->curdim - 1;
if (!npyarr->labels[labelidx]) {
npyarr->labels[labelidx] = PyList_New(0);
}
labels = npyarr->labels[labelidx];
// only fill label array once, assumes all column labels are the same
// for 2-dimensional arrays.
if (PyList_Check(labels) && PyList_GET_SIZE(labels) <= npyarr->elcount) {
PyList_Append(labels, label);
}
if (((JSONObjectDecoder *)npyarr->dec)->arrayAddItem(prv, obj, value)) {
Py_DECREF(label);
return 1;
}
return 0;
}
int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
int ret = PyDict_SetItem(obj, name, value);
Py_DECREF((PyObject *)name);
Py_DECREF((PyObject *)value);
return ret == 0 ? 1 : 0;
}
int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
int ret = PyList_Append(obj, value);
Py_DECREF((PyObject *)value);
return ret == 0 ? 1 : 0;
}
JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) {
return PyUnicode_FromWideChar(start, (end - start));
}
JSOBJ Object_newTrue(void *prv) { Py_RETURN_TRUE; }
JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; }
JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; }
JSOBJ Object_newPosInf(void *prv) { return PyFloat_FromDouble(Py_HUGE_VAL); }
JSOBJ Object_newNegInf(void *prv) { return PyFloat_FromDouble(-Py_HUGE_VAL); }
JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); }
JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; }
JSOBJ Object_newArray(void *prv, void *decoder) { return PyList_New(0); }
JSOBJ Object_endArray(void *prv, JSOBJ obj) { return obj; }
JSOBJ Object_newInteger(void *prv, JSINT32 value) {
return PyLong_FromLong((long)value);
}
JSOBJ Object_newLong(void *prv, JSINT64 value) {
return PyLong_FromLongLong(value);
}
JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) {
return PyLong_FromUnsignedLongLong(value);
}
JSOBJ Object_newDouble(void *prv, double value) {
return PyFloat_FromDouble(value);
}
static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) {
PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
if (obj != decoder->npyarr_addr) {
Py_XDECREF(((PyObject *)obj));
}
}
static char *g_kwlist[] = {"obj", "precise_float", "numpy",
"labelled", "dtype", NULL};
PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) {
PyObject *ret;
PyObject *sarg;
PyObject *arg;
PyObject *opreciseFloat = NULL;
JSONObjectDecoder *decoder;
PyObjectDecoder pyDecoder;
PyArray_Descr *dtype = NULL;
int numpy = 0, labelled = 0;
JSONObjectDecoder dec = {
Object_newString, Object_objectAddKey, Object_arrayAddItem,
Object_newTrue, Object_newFalse, Object_newNull,
Object_newPosInf, Object_newNegInf, Object_newObject,
Object_endObject, Object_newArray, Object_endArray,
Object_newInteger, Object_newLong, Object_newUnsignedLong,
Object_newDouble,
Object_releaseObject, PyObject_Malloc, PyObject_Free,
PyObject_Realloc};
dec.preciseFloat = 0;
dec.prv = NULL;
pyDecoder.dec = dec;
pyDecoder.curdim = 0;
pyDecoder.npyarr = NULL;
pyDecoder.npyarr_addr = NULL;
decoder = (JSONObjectDecoder *)&pyDecoder;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiiO&", g_kwlist, &arg,
&opreciseFloat, &numpy, &labelled,
PyArray_DescrConverter2, &dtype)) {
Npy_releaseContext(pyDecoder.npyarr);
return NULL;
}
if (opreciseFloat && PyObject_IsTrue(opreciseFloat)) {
decoder->preciseFloat = 1;
}
if (PyBytes_Check(arg)) {
sarg = arg;
} else if (PyUnicode_Check(arg)) {
sarg = PyUnicode_AsUTF8String(arg);
if (sarg == NULL) {
// Exception raised above us by codec according to docs
return NULL;
}
} else {
PyErr_Format(PyExc_TypeError, "Expected 'str' or 'bytes'");
return NULL;
}
decoder->errorStr = NULL;
decoder->errorOffset = NULL;
if (numpy) {
pyDecoder.dtype = dtype;
decoder->newArray = Object_npyNewArray;
decoder->endArray = Object_npyEndArray;
decoder->arrayAddItem = Object_npyArrayAddItem;
if (labelled) {
decoder->newObject = Object_npyNewObject;
decoder->endObject = Object_npyEndObject;
decoder->objectAddKey = Object_npyObjectAddKey;
}
}
ret = JSON_DecodeObject(decoder, PyBytes_AS_STRING(sarg),
PyBytes_GET_SIZE(sarg));
if (sarg != arg) {
Py_DECREF(sarg);
}
if (PyErr_Occurred()) {
if (ret) {
Py_DECREF((PyObject *)ret);
}
Npy_releaseContext(pyDecoder.npyarr);
return NULL;
}
if (decoder->errorStr) {
/*
FIXME: It's possible to give a much nicer error message here with actual
failing element in input etc*/
PyErr_Format(PyExc_ValueError, "%s", decoder->errorStr);
if (ret) {
Py_DECREF((PyObject *)ret);
}
Npy_releaseContext(pyDecoder.npyarr);
return NULL;
}
return ret;
}

View File

@@ -0,0 +1,151 @@
/*
Copyright (c) 2020, PyData Development Team
All rights reserved.
Distributed under the terms of the BSD Simplified License.
The full license is in the LICENSE file, distributed with this software.
*/
// Conversion routines that are useful for serialization,
// but which don't interact with JSON objects directly
#include "date_conversions.h"
#include <../../../tslibs/src/datetime/np_datetime.h>
#include <../../../tslibs/src/datetime/np_datetime_strings.h>
/*
* Function: scaleNanosecToUnit
* -----------------------------
*
* Scales an integer value representing time in nanoseconds to provided unit.
*
* Mutates the provided value directly. Returns 0 on success, non-zero on error.
*/
int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
switch (unit) {
case NPY_FR_ns:
break;
case NPY_FR_us:
*value /= 1000LL;
break;
case NPY_FR_ms:
*value /= 1000000LL;
break;
case NPY_FR_s:
*value /= 1000000000LL;
break;
default:
return -1;
}
return 0;
}
/* Converts the int64_t representation of a datetime to ISO; mutates len */
char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
npy_datetimestruct dts;
int ret_code;
pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
char *result = PyObject_Malloc(*len);
if (result == NULL) {
PyErr_NoMemory();
return NULL;
}
ret_code = make_iso_8601_datetime(&dts, result, *len, base);
if (ret_code != 0) {
PyErr_SetString(PyExc_ValueError,
"Could not convert datetime value to string");
PyObject_Free(result);
return NULL;
}
// Note that get_datetime_iso_8601_strlen just gives a generic size
// for ISO string conversion, not the actual size used
*len = strlen(result);
return result;
}
npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) {
scaleNanosecToUnit(&dt, base);
return dt;
}
/* Converts a PyDateTime to an ISO C-string; mutates len */
char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base,
size_t *len) {
npy_datetimestruct dts;
int ret;
ret = convert_pydatetime_to_datetimestruct(obj, &dts);
if (ret != 0) {
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_ValueError,
"Could not convert PyDateTime to numpy datetime");
}
return NULL;
}
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
char *result = PyObject_Malloc(*len);
if (result == NULL) {
PyErr_NoMemory();
return NULL;
}
ret = make_iso_8601_datetime(&dts, result, *len, base);
if (ret != 0) {
PyErr_SetString(PyExc_ValueError,
"Could not convert datetime value to string");
PyObject_Free(result);
return NULL;
}
// Note that get_datetime_iso_8601_strlen just gives a generic size
// for ISO string conversion, not the actual size used
*len = strlen(result);
return result;
}
npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) {
npy_datetimestruct dts;
int ret;
ret = convert_pydatetime_to_datetimestruct(dt, &dts);
if (ret != 0) {
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_ValueError,
"Could not convert PyDateTime to numpy datetime");
}
// TODO(username): is setting errMsg required?
// ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
// return NULL;
}
npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts);
return NpyDateTimeToEpoch(npy_dt, base);
}
/* Converts the int64_t representation of a duration to ISO; mutates len */
char *int64ToIsoDuration(int64_t value, size_t *len) {
pandas_timedeltastruct tds;
int ret_code;
pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds);
// Max theoretical length of ISO Duration with 64 bit day
// as the largest unit is 70 characters + 1 for a null terminator
char *result = PyObject_Malloc(71);
if (result == NULL) {
PyErr_NoMemory();
return NULL;
}
ret_code = make_iso_8601_timedelta(&tds, result, len);
if (ret_code == -1) {
PyErr_SetString(PyExc_ValueError,
"Could not convert timedelta value to string");
PyObject_Free(result);
return NULL;
}
return result;
}
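/*
   Illustrative sketch (not part of the original file): converting the same
   nanosecond timestamp both to a coarser epoch unit and to an ISO-8601 string.
   The timestamp value (2020-01-01T00:00:00 UTC in nanoseconds) and the helper
   name are only examples.
*/
static void date_conversions_usage_example(void) {
    npy_int64 epoch_ns = 1577836800000000000LL;  /* 2020-01-01T00:00:00 UTC */
    npy_int64 epoch_s = epoch_ns;
    size_t len = 0;
    char *iso;
    if (scaleNanosecToUnit(&epoch_s, NPY_FR_s) == 0) {
        /* epoch_s == 1577836800 */
    }
    iso = int64ToIso(epoch_ns, NPY_FR_s, &len);
    if (iso != NULL) {
        /* iso holds the seconds-precision ISO representation, len its length */
        PyObject_Free(iso);
    }
}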

View File

@@ -0,0 +1,39 @@
/*
Copyright (c) 2020, PyData Development Team
All rights reserved.
Distributed under the terms of the BSD Simplified License.
The full license is in the LICENSE file, distributed with this software.
*/
#ifndef PANDAS__LIBS_SRC_UJSON_PYTHON_DATE_CONVERSIONS_H_
#define PANDAS__LIBS_SRC_UJSON_PYTHON_DATE_CONVERSIONS_H_
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <numpy/ndarraytypes.h>
// Scales value in place from nanosecond resolution to the given unit resolution
int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit);
// Converts an int64 object representing a date to ISO format
// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z
// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
// len is mutated to save the length of the returned string
char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len);
// TODO(username): this function doesn't do a lot; should augment or
// replace with scaleNanosecToUnit
npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base);
// Converts a Python object representing a Date / Datetime to ISO format
// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z
// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
// len is mutated to save the length of the returned string
char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, size_t *len);
// Convert a Python Date/Datetime to Unix epoch with resolution base
npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base);
char *int64ToIsoDuration(int64_t value, size_t *len);
#endif // PANDAS__LIBS_SRC_UJSON_PYTHON_DATE_CONVERSIONS_H_

File diff suppressed because it is too large

View File

@@ -0,0 +1,81 @@
/*
Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the ESN Social Software AB nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
https://github.com/client9/stringencoders
Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
Numeric decoder derived from the TCL library
https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
* Copyright (c) 1988-1993 The Regents of the University of California.
* Copyright (c) 1994 Sun Microsystems, Inc.
*/
#include "version.h"
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
#include "numpy/arrayobject.h"
/* objToJSON */
PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs);
void initObjToJSON(void);
/* JSONToObj */
PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs);
#define ENCODER_HELP_TEXT \
"Use ensure_ascii=false to output UTF-8. Pass in double_precision to " \
"alter the maximum digit precision of doubles. Set " \
"encode_html_chars=True to encode < > & as unicode escape sequences."
static PyMethodDef ujsonMethods[] = {
{"encode", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS,
"Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT},
{"decode", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS,
"Converts JSON as string to dict object structure. Use precise_float=True "
"to use high precision float decoder."},
{"dumps", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS,
"Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT},
{"loads", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS,
"Converts JSON as string to dict object structure. Use precise_float=True "
"to use high precision float decoder."},
{NULL, NULL, 0, NULL} /* Sentinel */
};
static PyModuleDef moduledef = {
.m_base = PyModuleDef_HEAD_INIT,
.m_name = "_libjson",
.m_methods = ujsonMethods
};
PyMODINIT_FUNC PyInit_json(void) {
import_array()
initObjToJSON(); // TODO(username): clean up, maybe via tp_free?
return PyModuleDef_Init(&moduledef);
}

View File

@@ -0,0 +1,43 @@
/*
Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the ESN Social Software AB nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
https://github.com/client9/stringencoders
Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
Numeric decoder derived from the TCL library
https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
* Copyright (c) 1988-1993 The Regents of the University of California.
* Copyright (c) 1994 Sun Microsystems, Inc.
*/
#ifndef PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_
#define PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_
#define UJSON_VERSION "1.33"
#endif // PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_

Some files were not shown because too many files have changed in this diff