first commit

Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions

File diff suppressed because it is too large


@@ -0,0 +1,243 @@
from __future__ import annotations
import numpy as np
from pandas._libs import lib
from pandas.core.dtypes.cast import maybe_downcast_numeric
from pandas.core.dtypes.common import (
ensure_object,
is_datetime_or_timedelta_dtype,
is_decimal,
is_integer_dtype,
is_number,
is_numeric_dtype,
is_scalar,
needs_i8_conversion,
)
from pandas.core.dtypes.generic import (
ABCIndex,
ABCSeries,
)
import pandas as pd
from pandas.core.arrays.numeric import NumericArray
def to_numeric(arg, errors="raise", downcast=None):
"""
Convert argument to a numeric type.
The default return dtype is `float64` or `int64`
depending on the data supplied. Use the `downcast` parameter
to obtain other dtypes.
Please note that precision loss may occur if really large numbers
are passed in. Due to the internal limitations of `ndarray`, if
numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
passed in, it is very likely they will be converted to float so that
they can be stored in an `ndarray`. These warnings apply similarly to
`Series` since it internally leverages `ndarray`.
Parameters
----------
arg : scalar, list, tuple, 1-d array, or Series
Argument to be converted.
errors : {'ignore', 'raise', 'coerce'}, default 'raise'
- If 'raise', then invalid parsing will raise an exception.
- If 'coerce', then invalid parsing will be set as NaN.
- If 'ignore', then invalid parsing will return the input.
downcast : str, default None
Can be 'integer', 'signed', 'unsigned', or 'float'.
If not None, and if the data has been successfully cast to a
numerical dtype (or if the data was numeric to begin with),
downcast that resulting data to the smallest numerical dtype
possible according to the following rules:
- 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
- 'unsigned': smallest unsigned int dtype (min.: np.uint8)
- 'float': smallest float dtype (min.: np.float32)
As this behaviour is separate from the core conversion to
numeric values, any errors raised during the downcasting
will be surfaced regardless of the value of the 'errors' input.
In addition, downcasting will only occur if the size
of the resulting data's dtype is strictly larger than
the dtype it is to be cast to, so if none of the dtypes
checked satisfy that specification, no downcasting will be
performed on the data.
Returns
-------
ret
Numeric if parsing succeeded.
Return type depends on input. Series if Series, otherwise ndarray.
See Also
--------
DataFrame.astype : Cast argument to a specified dtype.
to_datetime : Convert argument to datetime.
to_timedelta : Convert argument to timedelta.
numpy.ndarray.astype : Cast a numpy array to a specified type.
DataFrame.convert_dtypes : Convert dtypes.
Examples
--------
Take separate series and convert to numeric, coercing when told to
>>> s = pd.Series(['1.0', '2', -3])
>>> pd.to_numeric(s)
0 1.0
1 2.0
2 -3.0
dtype: float64
>>> pd.to_numeric(s, downcast='float')
0 1.0
1 2.0
2 -3.0
dtype: float32
>>> pd.to_numeric(s, downcast='signed')
0 1
1 2
2 -3
dtype: int8
>>> s = pd.Series(['apple', '1.0', '2', -3])
>>> pd.to_numeric(s, errors='ignore')
0 apple
1 1.0
2 2
3 -3
dtype: object
>>> pd.to_numeric(s, errors='coerce')
0 NaN
1 1.0
2 2.0
3 -3.0
dtype: float64
Downcasting of nullable integer and floating dtypes is supported:
>>> s = pd.Series([1, 2, 3], dtype="Int64")
>>> pd.to_numeric(s, downcast="integer")
0 1
1 2
2 3
dtype: Int8
>>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
>>> pd.to_numeric(s, downcast="float")
0 1.0
1 2.1
2 3.0
dtype: Float32
"""
if downcast not in (None, "integer", "signed", "unsigned", "float"):
raise ValueError("invalid downcasting method provided")
if errors not in ("ignore", "raise", "coerce"):
raise ValueError("invalid error value specified")
is_series = False
is_index = False
is_scalars = False
if isinstance(arg, ABCSeries):
is_series = True
values = arg.values
elif isinstance(arg, ABCIndex):
is_index = True
if needs_i8_conversion(arg.dtype):
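            # datetime64/timedelta64/period-backed indexes are handled through their
            # underlying int64 (i8) representation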
values = arg.asi8
else:
values = arg.values
elif isinstance(arg, (list, tuple)):
values = np.array(arg, dtype="O")
elif is_scalar(arg):
if is_decimal(arg):
return float(arg)
if is_number(arg):
return arg
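        # remaining scalars (e.g. strings) are wrapped in a 1-element object array so
        # they can reuse the array parsing path below; the value is unwrapped at the end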
is_scalars = True
values = np.array([arg], dtype="O")
elif getattr(arg, "ndim", 1) > 1:
raise TypeError("arg must be a list, tuple, 1-d array, or Series")
else:
values = arg
# GH33013: for IntegerArray & FloatingArray extract non-null values for casting
# save mask to reconstruct the full array after casting
mask: np.ndarray | None = None
if isinstance(values, NumericArray):
mask = values._mask
values = values._data[~mask]
values_dtype = getattr(values, "dtype", None)
if is_numeric_dtype(values_dtype):
pass
elif is_datetime_or_timedelta_dtype(values_dtype):
values = values.view(np.int64)
else:
values = ensure_object(values)
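    # only errors="coerce" asks maybe_convert_numeric to turn unparseable values into
    # NaN; with "ignore" the parsing error is swallowed below and the original values
    # are returned unchanged, while "raise" propagates it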
coerce_numeric = errors not in ("ignore", "raise")
try:
values, _ = lib.maybe_convert_numeric(
values, set(), coerce_numeric=coerce_numeric
)
except (ValueError, TypeError):
if errors == "raise":
raise
# attempt downcast only if the data has been successfully converted
# to a numerical dtype and if a downcast method has been specified
if downcast is not None and is_numeric_dtype(values.dtype):
typecodes: str | None = None
if downcast in ("integer", "signed"):
typecodes = np.typecodes["Integer"]
elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0):
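            # unsigned downcasting is only attempted when the data is empty or all
            # values are non-negative; otherwise typecodes stays None and the data is
            # left as-is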
typecodes = np.typecodes["UnsignedInteger"]
elif downcast == "float":
typecodes = np.typecodes["Float"]
# pandas support goes only to np.float32,
# as float dtypes smaller than that are
# extremely rare and not well supported
float_32_char = np.dtype(np.float32).char
float_32_ind = typecodes.index(float_32_char)
typecodes = typecodes[float_32_ind:]
if typecodes is not None:
# from smallest to largest
for typecode in typecodes:
dtype = np.dtype(typecode)
if dtype.itemsize <= values.dtype.itemsize:
values = maybe_downcast_numeric(values, dtype)
# successful conversion
if values.dtype == dtype:
break
# GH33013: for IntegerArray & FloatingArray need to reconstruct masked array
if mask is not None:
data = np.zeros(mask.shape, dtype=values.dtype)
data[~mask] = values
from pandas.core.arrays import (
FloatingArray,
IntegerArray,
)
klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray
values = klass(data, mask.copy())
if is_series:
return arg._constructor(values, index=arg.index, name=arg.name)
elif is_index:
# because we want to coerce to numeric if possible,
# do not use _shallow_copy
return pd.Index(values, name=arg.name)
elif is_scalars:
return values[0]
else:
return values


@@ -0,0 +1,196 @@
"""
timedelta support tools
"""
from __future__ import annotations
import numpy as np
from pandas._libs import lib
from pandas._libs.tslibs import (
NaT,
NaTType,
)
from pandas._libs.tslibs.timedeltas import (
Timedelta,
parse_timedelta_unit,
)
from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.generic import (
ABCIndex,
ABCSeries,
)
from pandas.core.arrays.timedeltas import sequence_to_td64ns
def to_timedelta(arg, unit=None, errors="raise"):
"""
Convert argument to timedelta.
Timedeltas are absolute differences in times, expressed in difference
units (e.g. days, hours, minutes, seconds). This method converts
an argument from a recognized timedelta format / value into
a Timedelta type.
Parameters
----------
arg : str, timedelta, list-like or Series
The data to be converted to timedelta.
.. deprecated:: 1.2
Strings with units 'M', 'Y' and 'y' do not represent
unambiguous timedelta values and will be removed in a future version.
unit : str, optional
Denotes the unit of the arg for numeric `arg`. Defaults to ``"ns"``.
Possible values:
* 'W'
* 'D' / 'days' / 'day'
* 'hours' / 'hour' / 'hr' / 'h'
* 'm' / 'minute' / 'min' / 'minutes' / 'T'
* 'S' / 'seconds' / 'sec' / 'second'
* 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L'
* 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U'
* 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N'
.. versionchanged:: 1.1.0
Must not be specified when `arg` contains strings and
``errors="raise"``.
errors : {'ignore', 'raise', 'coerce'}, default 'raise'
- If 'raise', then invalid parsing will raise an exception.
- If 'coerce', then invalid parsing will be set as NaT.
- If 'ignore', then invalid parsing will return the input.
Returns
-------
timedelta
If parsing succeeded.
Return type depends on input:
- list-like: TimedeltaIndex of timedelta64 dtype
- Series: Series of timedelta64 dtype
- scalar: Timedelta
See Also
--------
DataFrame.astype : Cast argument to a specified dtype.
to_datetime : Convert argument to datetime.
convert_dtypes : Convert dtypes.
Notes
-----
If the precision is higher than nanoseconds, the precision of the duration is
truncated to nanoseconds for string inputs.
Examples
--------
Parsing a single string to a Timedelta:
>>> pd.to_timedelta('1 days 06:05:01.00003')
Timedelta('1 days 06:05:01.000030')
>>> pd.to_timedelta('15.5us')
Timedelta('0 days 00:00:00.000015500')
Parsing a list or array of strings:
>>> pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan'])
TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT],
dtype='timedelta64[ns]', freq=None)
Converting numbers by specifying the `unit` keyword argument:
>>> pd.to_timedelta(np.arange(5), unit='s')
TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02',
'0 days 00:00:03', '0 days 00:00:04'],
dtype='timedelta64[ns]', freq=None)
>>> pd.to_timedelta(np.arange(5), unit='d')
TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
dtype='timedelta64[ns]', freq=None)
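    Strings that cannot be parsed become ``NaT`` with ``errors='coerce'`` (an
    illustrative example of the coercion behaviour described above):
    >>> pd.to_timedelta(['1 days', 'foo'], errors='coerce')
    TimedeltaIndex(['1 days', NaT], dtype='timedelta64[ns]', freq=None)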
"""
if unit is not None:
unit = parse_timedelta_unit(unit)
if errors not in ("ignore", "raise", "coerce"):
raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'.")
if unit in {"Y", "y", "M"}:
raise ValueError(
"Units 'M', 'Y', and 'y' are no longer supported, as they do not "
"represent unambiguous timedelta values durations."
)
if arg is None:
return arg
elif isinstance(arg, ABCSeries):
values = _convert_listlike(arg._values, unit=unit, errors=errors)
return arg._constructor(values, index=arg.index, name=arg.name)
elif isinstance(arg, ABCIndex):
return _convert_listlike(arg, unit=unit, errors=errors, name=arg.name)
elif isinstance(arg, np.ndarray) and arg.ndim == 0:
# extract array scalar and process below
arg = lib.item_from_zerodim(arg)
elif is_list_like(arg) and getattr(arg, "ndim", 1) == 1:
return _convert_listlike(arg, unit=unit, errors=errors)
elif getattr(arg, "ndim", 1) > 1:
raise TypeError(
"arg must be a string, timedelta, list, tuple, 1-d array, or Series"
)
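    # a string such as "2 days" already encodes its own unit, so combining it with an
    # explicit ``unit=`` would be ambiguous (see the versionchanged note above)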
if isinstance(arg, str) and unit is not None:
raise ValueError("unit must not be specified if the input is/contains a str")
# ...so it must be a scalar value. Return scalar.
return _coerce_scalar_to_timedelta_type(arg, unit=unit, errors=errors)
def _coerce_scalar_to_timedelta_type(r, unit="ns", errors="raise"):
"""Convert string 'r' to a timedelta object."""
result: Timedelta | NaTType
try:
result = Timedelta(r, unit)
except ValueError:
if errors == "raise":
raise
elif errors == "ignore":
return r
# coerce
result = NaT
return result
def _convert_listlike(arg, unit=None, errors="raise", name=None):
"""Convert a list of objects to a timedelta index object."""
if isinstance(arg, (list, tuple)) or not hasattr(arg, "dtype"):
# This is needed only to ensure that in the case where we end up
# returning arg (errors == "ignore"), and where the input is a
# generator, we return a useful list-like instead of a
# used-up generator
arg = np.array(list(arg), dtype=object)
try:
td64arr = sequence_to_td64ns(arg, unit=unit, errors=errors, copy=False)[0]
except ValueError:
if errors == "ignore":
return arg
else:
# This else-block accounts for the cases when errors='raise'
# and errors='coerce'. If errors == 'raise', these errors
# should be raised. If errors == 'coerce', we shouldn't
# expect any errors to be raised, since all parsing errors
# cause coercion to pd.NaT. However, if an error / bug is
# introduced that causes an Exception to be raised, we would
# like to surface it.
raise
from pandas import TimedeltaIndex
value = TimedeltaIndex(td64arr, unit="ns", name=name)
return value


@@ -0,0 +1,146 @@
from __future__ import annotations
from datetime import (
datetime,
time,
)
import numpy as np
from pandas._libs.lib import is_list_like
from pandas.core.dtypes.generic import (
ABCIndex,
ABCSeries,
)
from pandas.core.dtypes.missing import notna
def to_time(arg, format=None, infer_time_format=False, errors="raise"):
"""
Parse time strings to time objects using fixed strptime formats ("%H:%M",
"%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p",
"%I%M%S%p")
Use infer_time_format if all the strings are in the same format to speed
up conversion.
Parameters
----------
arg : string in time format, datetime.time, list, tuple, 1-d array, Series
format : str, default None
Format used to convert arg into a time object. If None, fixed formats
are used.
infer_time_format : bool, default False
Infer the time format based on the first non-NaN element. If all
strings are in the same format, this will speed up conversion.
errors : {'ignore', 'raise', 'coerce'}, default 'raise'
- If 'raise', then invalid parsing will raise an exception
- If 'coerce', then invalid parsing will be set as None
- If 'ignore', then invalid parsing will return the input
Returns
-------
datetime.time
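    Examples
    --------
    Illustrative usage of this helper (it is not exported in the public ``pd``
    namespace):
    >>> to_time("14:30")
    datetime.time(14, 30)
    >>> to_time(["14:30", "2:45PM"])
    [datetime.time(14, 30), datetime.time(14, 45)]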
"""
def _convert_listlike(arg, format):
if isinstance(arg, (list, tuple)):
arg = np.array(arg, dtype="O")
elif getattr(arg, "ndim", 1) > 1:
raise TypeError(
"arg must be a string, datetime, list, tuple, 1-d array, or Series"
)
arg = np.asarray(arg, dtype="O")
if infer_time_format and format is None:
format = _guess_time_format_for_array(arg)
times: list[time | None] = []
if format is not None:
for element in arg:
try:
times.append(datetime.strptime(element, format).time())
except (ValueError, TypeError) as err:
if errors == "raise":
msg = (
f"Cannot convert {element} to a time with given "
f"format {format}"
)
raise ValueError(msg) from err
elif errors == "ignore":
return arg
else:
times.append(None)
else:
formats = _time_formats[:]
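            # work on a copy so that promoting a matched format to the front does not
            # reorder the module-level _time_formats list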
format_found = False
for element in arg:
time_object = None
for time_format in formats:
try:
time_object = datetime.strptime(element, time_format).time()
if not format_found:
# Put the found format in front
fmt = formats.pop(formats.index(time_format))
formats.insert(0, fmt)
format_found = True
break
except (ValueError, TypeError):
continue
if time_object is not None:
times.append(time_object)
elif errors == "raise":
raise ValueError(f"Cannot convert arg {arg} to a time")
elif errors == "ignore":
return arg
else:
times.append(None)
return times
if arg is None:
return arg
elif isinstance(arg, time):
return arg
elif isinstance(arg, ABCSeries):
values = _convert_listlike(arg._values, format)
return arg._constructor(values, index=arg.index, name=arg.name)
elif isinstance(arg, ABCIndex):
return _convert_listlike(arg, format)
elif is_list_like(arg):
return _convert_listlike(arg, format)
return _convert_listlike(np.array([arg]), format)[0]
# Fixed time formats for time parsing
_time_formats = [
"%H:%M",
"%H%M",
"%I:%M%p",
"%I%M%p",
"%H:%M:%S",
"%H%M%S",
"%I:%M:%S%p",
"%I%M%S%p",
]
def _guess_time_format_for_array(arr):
# Try to guess the format based on the first non-NaN element
non_nan_elements = notna(arr).nonzero()[0]
if len(non_nan_elements):
element = arr[non_nan_elements[0]]
for time_format in _time_formats:
try:
datetime.strptime(element, time_format)
return time_format
except ValueError:
pass
return None