Source code for slide.utils

from typing import (
    Any,
    Callable,
    Dict,
    Generic,
    Iterable,
    List,
    Optional,
    Tuple,
    TypeVar,
)

import numpy as np
import pandas as pd
import pyarrow as pa
from triad.utils.assertion import assert_or_throw
from triad.utils.pyarrow import (
    TRIAD_DEFAULT_TIMESTAMP,
    apply_schema,
    to_pa_datatype,
    to_single_pandas_dtype,
)

from slide._string_utils import LikeExpr, LikeExprShortcut
from slide.exceptions import SlideCastError, SlideIndexIncompatibleError

TDf = TypeVar("TDf", bound=Any)
TCol = TypeVar("TCol", bound=Any)
_ANTI_INDICATOR = "__anti_indicator__"
_CROSS_INDICATOR = "__corss_indicator__"


[docs]def parse_join_type(join_type: str) -> str:
    """Parse and normalize join type string. The normalization
    will lower the string, remove all space and ``_``, and then
    map to the limited options.

    Here are the options after normalization: ``inner``, ``cross``,
    ``left_semi``, ``left_anti``, ``left_outer``, ``right_outer``,
    ``full_outer``.

    :param join_type: the raw join type string
    :raises NotImplementedError: if not supported
    :return: the normalized join type string
    """
    join_type = join_type.replace(" ", "").replace("_", "").lower()
    if join_type in ["inner", "cross"]:
        return join_type
    if join_type in ["inner", "join"]:
        return "inner"
    if join_type in ["semi", "leftsemi"]:
        return "left_semi"
    if join_type in ["anti", "leftanti"]:
        return "left_anti"
    if join_type in ["left", "leftouter"]:
        return "left_outer"
    if join_type in ["right", "rightouter"]:
        return "right_outer"
    if join_type in ["outer", "full", "fullouter"]:
        return "full_outer"
    raise NotImplementedError(join_type)


[docs]class SlideUtils(Generic[TDf, TCol]):
    """A collection of utils for general pandas like dataframes"""

[docs]    def to_safe_pa_type(self, tp: Any) -> pa.DataType:
        if isinstance(tp, (np.dtype, pd.api.extensions.ExtensionDtype)):
            if pd.api.types.is_datetime64_any_dtype(tp):
                return TRIAD_DEFAULT_TIMESTAMP
            if pd.api.types.is_object_dtype(tp):
                return pa.string()
            if pd.__version__ >= "1.2":
                if pd.Float64Dtype() == tp:
                    return pa.float64()
                if pd.Float32Dtype() == tp:
                    return pa.float32()
        return to_pa_datatype(tp)

[docs]    def is_series(self, obj: Any) -> bool:  # pragma: no cover
        """Check whether is a series type

        :param obj: the object
        :return: whether it is a series
        """
        raise NotImplementedError

[docs]    def to_series(self, obj: Any, name: Optional[str] = None) -> TCol:
        """Convert an object to series

        :param obj: the object
        :param name: name of the series, defaults to None
        :return: the series
        """
        raise NotImplementedError  # pragma: no cover

[docs]    def series_to_array(self, col: TCol) -> List[Any]:
        """Convert a series to numpy array

        :param col: the series
        :return: the numpy array
        """
        raise NotImplementedError  # pragma: no cover

[docs]    def to_constant_series(
        self,
        constant: Any,
        from_series: TCol,
        dtype: Any = None,
        name: Optional[str] = None,
    ) -> TCol:  # pragma: no cover
        """Convert a constant to a series with the same index of ``from_series``

        :param constant: the constant
        :param from_series: the reference series for index
        :param dtype: default data type, defaults to None
        :param name: name of the series, defaults to None
        :return: the series
        """
        raise NotImplementedError

[docs]    def get_col_pa_type(self, col: Any) -> pa.DataType:
        """Get column or constant pyarrow data type

        :param col: the column or the constant
        :return: pyarrow data type
        """
        if self.is_series(col):
            tp = col.dtype
            if tp == np.dtype("object") or tp == np.dtype(str):
                return pa.string()
            return self.to_safe_pa_type(tp)
        return self.to_safe_pa_type(type(col))

[docs]    def unary_arithmetic_op(self, col: Any, op: str) -> Any:
        """Unary arithmetic operator on series/constants

        :param col: a series or a constant
        :param op: can be ``+`` or ``-``
        :return: the transformed series or constant
        :raises NotImplementedError: if ``op`` is not supported

        .. note:

        All behaviors should be consistent with SQL correspondent operations.
        """
        if op == "+":
            return col
        if op == "-":
            return 0 - col
        raise NotImplementedError(f"{op} is not supported")  # pragma: no cover

[docs]    def binary_arithmetic_op(self, col1: Any, col2: Any, op: str) -> Any:
        """Binary arithmetic operations ``+``, ``-``, ``*``, ``/``

        :param col1: the first column (series or constant)
        :param col2: the second column (series or constant)
        :param op: ``+``, ``-``, ``*``, ``/``
        :return: the result after the operation (series or constant)
        :raises NotImplementedError: if ``op`` is not supported

        .. note:

        All behaviors should be consistent with SQL correspondent operations.
        """
        if op == "+":
            return col1 + col2
        if op == "-":
            return col1 - col2
        if op == "*":
            return col1 * col2
        if op == "/":
            return col1 / col2
        raise NotImplementedError(f"{op} is not supported")  # pragma: no cover

[docs]    def comparison_op(self, col1: Any, col2: Any, op: str) -> Any:
        """Binary comparison ``<``, ``<=``, ``==``, ``>``, ``>=``

        :param col1: the first column (series or constant)
        :param col2: the second column (series or constant)
        :param op: ``<``, ``<=``, ``==``, ``>``, ``>=``
        :return: the result after the operation (series or constant)
        :raises NotImplementedError: if ``op`` is not supported

        .. note:

        All behaviors should be consistent with SQL correspondent operations.
        """

        if col1 is None and col2 is None:
            return None
        if op == "==":
            s: Any = col1 == col2
        elif op == "!=":
            s = col1 != col2
        elif op == "<":
            s = col1 < col2
        elif op == "<=":
            s = col1 <= col2
        elif op == ">":
            s = col1 > col2
        elif op == ">=":
            s = col1 >= col2
        else:  # pragma: no cover
            raise NotImplementedError(f"{op} is not supported")
        return self._set_op_result_to_none(s, col1, col2)

[docs]    def binary_logical_op(self, col1: Any, col2: Any, op: str) -> Any:
        """Binary logical operations ``and``, ``or``

        :param col1: the first column (series or constant)
        :param col2: the second column (series or constant)
        :param op: ``and``, ``or``
        :return: the result after the operation (series or constant)
        :raises NotImplementedError: if ``op`` is not supported

        .. note:

        All behaviors should be consistent with SQL correspondent operations.
        """
        c1 = self._safe_bool(col1)
        c2 = self._safe_bool(col2)
        if op == "and":
            if not self.is_series(c1) and not self.is_series(c2):
                return c1 and c2
            return c1 & c2
        elif op == "or":
            if not self.is_series(c1) and not self.is_series(c2):
                return c1 or c2
            return c1 | c2
        raise NotImplementedError(f"{op} is not supported")  # pragma: no cover

[docs]    def logical_not(self, col: Any) -> Any:
        """Logical ``NOT``

        .. note:

        All behaviors should be consistent with SQL correspondent operations.
        """
        b = self._safe_bool(col)
        if self.is_series(b):
            return ~b
        return None if b is None else not b

[docs]    def cast(  # noqa: C901
        self, col: Any, type_obj: Any, input_type: Any = None
    ) -> Any:
        """Cast ``col`` to a new type. ``type_obj`` must be
        able to be converted by :func:`~triad.utils.pyarrow.self.to_safe_pa_type`.

        :param col: a series or a constant
        :param type_obj: an objected that can be accepted by
            :func:`~triad.utils.pyarrow.self.to_safe_pa_type`
        :param input_type: an objected that is either None or to be accepted by
            :func:`~triad.utils.pyarrow.self.to_safe_pa_type`, defaults to None.
        :return: the new column or constant

        .. note:

        If ``input_type`` is not None, then it can be used to determine
        the casting behavior. This can be useful when the input is boolean with
        nulls or strings, where the pandas dtype may not provide the accurate
        type information.
        """
        to_type = self.to_safe_pa_type(type_obj)
        t_type = to_single_pandas_dtype(to_type, use_extension_types=True)
        try:
            if self.is_series(col):
                try:
                    inf_type = self.get_col_pa_type(col)
                    has_hint = input_type is not None
                    from_type = (
                        inf_type if not has_hint else self.to_safe_pa_type(input_type)
                    )
                    if pa.types.is_string(to_type):
                        if (
                            has_hint
                            and pa.types.is_string(from_type)
                            and pa.types.is_string(inf_type)
                        ):
                            return col
                    elif from_type == inf_type == to_type:
                        return col
                except Exception:  # pragma: no cover
                    return col.astype(t_type)
                if pa.types.is_boolean(to_type):
                    return self._cast_to_bool(col, from_type, inf_type, t_type)
                if pa.types.is_integer(to_type):
                    return self._cast_to_int(col, from_type, inf_type, t_type)
                elif pa.types.is_floating(to_type):
                    return self._cast_to_float(col, from_type, inf_type, t_type)
                elif pa.types.is_timestamp(to_type):
                    return self._cast_to_datetime(col, from_type, inf_type, t_type)
                elif pa.types.is_date(to_type):
                    return self._cast_to_date(col, from_type, inf_type, t_type)
                elif pa.types.is_string(to_type):
                    return self._cast_to_str(col, from_type, inf_type, t_type)
                return col.astype(t_type)
            else:
                if col is None:
                    return None
                from_type = (
                    self.get_col_pa_type(col) if input_type is None else input_type
                )
                res = self.series_to_array(
                    self.cast(
                        self.to_series([col]),
                        type_obj=type_obj,
                        input_type=from_type,
                    )
                )[0]
                if pd.isna(res):
                    return None
                return res
        except (TypeError, ValueError) as te:
            raise SlideCastError(f"unable to cast from {from_type} to {t_type}") from te

[docs]    def filter_df(self, df: TDf, cond: Any) -> TDf:
        """Filter dataframe by a boolean series or a constant

        :param df: the dataframe
        :param cond: a boolean seris or a constant
        :return: the filtered dataframe

        .. note:

        Filtering behavior should be consistent with SQL.
        """
        c = self._safe_bool(cond)
        if self.is_series(c):
            return df[c]
        elif c:
            return df
        else:
            return df.head(0)

[docs]    def is_value(self, col: Any, value: Any, positive: bool = True) -> Any:
        """Check if the series or constant is ``value``

        :param col: the series or constant
        :param value: ``None``, ``True`` or ``False``
        :param positive: check ``is value`` or ``is not value``,
            defaults to True (``is value``)
        :raises NotImplementedError: if value is not supported
        :return: a bool value or a series
        """
        if self.is_series(col):
            if value is None:
                if positive:
                    return col.isnull()
                else:
                    return ~col.isnull()
            elif isinstance(value, bool) and value:
                if positive:
                    return (col != 0) & (~col.isnull())
                else:
                    return (col == 0) | col.isnull()
            elif isinstance(value, bool) and not value:
                if positive:
                    return (col == 0) & (~col.isnull())
                else:
                    return (col != 0) | col.isnull()
            raise NotImplementedError(value)
        else:
            return self.series_to_array(
                self.is_value(self.to_series([col]), value=value, positive=positive)
            )[0]

[docs]    def is_in(self, col: Any, values: List[Any], positive: bool) -> Any:  # noqa: C901
        """Check if a series or a constant is in ``values``

        :param col: the series or the constant
        :param values: a list of constants and series (can mix)
        :param positive: ``is in`` or ``is not in``
        :return: the correspondent boolean series or constant

        .. note:

        This behavior should be consistent with SQL ``IN`` and ``NOT IN``.
        The return values can be ``True``, ``False`` and ``None``
        """
        if self.is_series(col):
            cols = [x for x in values if self.is_series(x)]
            others = [x for x in values if not self.is_series(x)]
            has_null_constant = any(pd.isna(x) for x in others)
            innulls: Any = None
            if positive:
                o: Any = col.isin(others)
                for c in cols:
                    o = o | (col == c)
                    if not has_null_constant:
                        if innulls is None:
                            innulls = c.isnull()
                        else:
                            innulls = innulls | c.isnull()
            else:
                o = ~col.isin(others)
                for c in cols:
                    o = o & (col != c)
                    if not has_null_constant:
                        if innulls is None:
                            innulls = c.isnull()
                        else:
                            innulls = innulls | c.isnull()
            if has_null_constant:
                o = o.mask(o == (0 if positive else 1), None)
            elif innulls is not None:
                o = o.mask(innulls & (o == (0 if positive else 1)), None)
            return o.mask(col.isnull(), None)
        else:
            res = self.series_to_array(
                self.is_in(self.to_series([col]), values=values, positive=positive)
            )[0]
            return None if pd.isna(res) else bool(res)

[docs]    def is_between(self, col: Any, lower: Any, upper: Any, positive: bool) -> Any:
        """Check if a series or a constant is ``>=lower`` and ``<=upper``

        :param col: the series or the constant
        :param lower: the lower bound, which can be series or a constant
        :param upper: the upper bound, which can be series or a constant
        :param positive: ``is between`` or ``is not between``
        :return: the correspondent boolean series or constant

        .. note:

        This behavior should be consistent with SQL ``BETWEEN`` and ``NOT BETWEEN``.
        The return values can be ``True``, ``False`` and ``None``
        """
        if col is None:
            return None
        if self.is_series(col):
            left = (
                self.to_constant_series(False, col)
                if lower is None
                else (lower <= col).fillna(False)
            )
            right = (
                self.to_constant_series(False, col)
                if upper is None
                else (col <= upper).fillna(False)
            )
            ln = lower.isnull() if self.is_series(lower) else lower is None
            un = upper.isnull() if self.is_series(upper) else upper is None
            s: Any = left & right
            s = s.mask(col.isnull() | (ln & un), None)
            if self.is_series(lower) or lower is None:
                s = s.mask(right & ln, None)
            if self.is_series(upper) or upper is None:
                s = s.mask(left & un, None)

            if positive:
                return s
            return (s == 0).mask(s.isnull(), None)
        else:
            res = self.series_to_array(
                self.is_between(
                    self.to_series([col]),
                    lower=lower
                    if lower is None or self.is_series(lower)
                    else self.to_series([lower]),
                    upper=upper
                    if upper is None or self.is_series(upper)
                    else self.to_series([upper]),
                    positive=positive,
                )
            )[0]
            return None if pd.isna(res) else bool(res)

[docs]    def coalesce(self, cols: List[Any]) -> Any:
        """Coalesce multiple series and constants

        :param cols: the collection of series and constants in order
        :return: the coalesced series or constant

        .. note:

        This behavior should be consistent with SQL ``COALESCE``
        """
        if any(self.is_series(s) for s in cols):
            tmp = self.cols_to_df(cols, [f"_{n}" for n in range(len(cols))])
            return tmp.fillna(method="bfill", axis=1)["_0"]
        for x in cols:
            if x is not None:
                return x
        return None

[docs]    def case_when(self, *pairs: Tuple[Any, Any], default: Any = None) -> Any:
        """SQL ``CASE WHEN``

        :param pairs: condition and value pairs, both can be either a
            series or a constant
        :param default: default value if none of the conditions satisfies,
            defaults to None
        :return: the final series or constant

        .. note:

        This behavior should be consistent with SQL ``CASE WHEN``
        """

        def _safe_pos(s: Any) -> Any:
            if self.is_series(s):
                return (~(s.isnull())) & (s != 0)
            return not pd.isna(s) and s != 0

        def get_series() -> Iterable[Tuple[str, Any]]:
            for n in range(len(pairs)):
                yield f"w_{n}", _safe_pos(pairs[n][0])
                yield f"t_{n}", pairs[n][1]
            yield "d", default

        all_series = list(get_series())
        if any(self.is_series(x[1]) for x in all_series):
            tmp = self.cols_to_df(
                [x[1] for x in all_series], names=[x[0] for x in all_series]
            )
            res = tmp["d"]
            for n in reversed(range(len(pairs))):
                if pairs[n][1] is None:
                    res = res.mask(tmp[f"w_{n}"], pd.NA)
                else:
                    res = res.mask(tmp[f"w_{n}"], tmp[f"t_{n}"])
            return res
        sd = {x[0]: x[1] for x in all_series}
        for n in range(len(pairs)):
            if sd[f"w_{n}"] == 1.0:
                return sd[f"t_{n}"]
        return sd["d"]

[docs]    def like(  # noqa: C901
        self, col: Any, expr: Any, ignore_case: bool = False, positive: bool = True
    ) -> Any:
        """SQL ``LIKE``

        :param col: a series or a constant
        :param expr: a pattern expression
        :param ignore_case: whether to ignore case, defaults to False
        :param positive: ``LIKE`` or ``NOT LIKE``, defaults to True
        :return: the correspondent boolean series or constant

        .. note:

        This behavior should be consistent with SQL ``LIKE``
        """
        assert_or_throw(
            expr is None or isinstance(expr, str),
            NotImplementedError("expr can only be a string"),
        )

        def like_series(col: TCol) -> TCol:
            le = LikeExpr(expr)
            if le.shortcut == LikeExprShortcut.EMPTY:
                return col == ""
            if le.shortcut == LikeExprShortcut.NOT_EMPTY:
                return col != ""
            if le.shortcut == LikeExprShortcut.SIMPLE:
                if not ignore_case:
                    return col == le.tokens[0][1]
                else:
                    return col.str.lower() == le.tokens[0][1].lower()
            if le.shortcut == LikeExprShortcut.ANY:
                return ~(col.isnull())
            if le.shortcut == LikeExprShortcut.START:
                if not ignore_case:
                    return col.str.startswith(le.tokens[0][1])
                return col.str.lower().str.startswith(le.tokens[0][1].lower())
            if le.shortcut == LikeExprShortcut.END:
                if not ignore_case:
                    return col.str.endswith(le.tokens[1][1]).mask(nulls, None)
                return col.str.lower().str.endswith(le.tokens[1][1].lower())
            if le.shortcut == LikeExprShortcut.START_END:
                if not ignore_case:
                    return col.str.startswith(le.tokens[0][1]) & col.str.endswith(
                        le.tokens[2][1]
                    )
                return col.str.lower().str.startswith(
                    le.tokens[0][1].lower()
                ) & col.str.lower().str.endswith(le.tokens[2][1].lower())
            if le.shortcut == LikeExprShortcut.CONTAIN:
                if not ignore_case:
                    return col.str.contains(le.tokens[1][1])
                return col.str.lower().str.contains(le.tokens[1][1].lower())
            if le.shortcut == LikeExprShortcut.NA:
                return col.str.match(le.re(), case=not ignore_case)
            raise NotImplementedError(le.shortcut)  # pragma: no cover

        if self.is_series(col):
            if expr is None:
                return self.to_constant_series(float("nan"), col)
            nulls = col.isnull()
            res = like_series(col)
            if positive:
                return res.mask(nulls, None)
            return (res == 0).mask(nulls, None)
        else:
            res = self.series_to_array(
                self.like(self.to_series([col]), expr=expr, ignore_case=ignore_case)
            )[0]
            return None if pd.isna(res) else bool(res)

[docs]    def cols_to_df(self, cols: List[Any], names: Optional[List[str]] = None) -> TDf:
        """Construct the dataframe from a list of columns (series)

        :param cols: the collection of series or constants, at least one value must
            be a series
        :param names: the correspondent column names, defaults to None

        :return: the dataframe

        .. note::

        If ``names`` is not provided, then every series in ``cols`` must be
        named. Otherise, ``names`` must align with ``cols``. But whether names
        have duplications or invalid chars will not be verified by this method
        """
        raise NotImplementedError  # pragma: no cover

[docs]    def empty(self, df: TDf) -> bool:
        """Check if the dataframe is empty

        :param df: pandas like dataframe
        :return: if it is empty
        """
        return len(df.index) == 0

[docs]    def as_arrow(self, df: TDf, schema: pa.Schema, type_safe: bool = True) -> pa.Table:
        """Convert the dataframe to pyarrow table

        :param df: pandas like dataframe
        :param schema: if specified, it will be used to construct pyarrow table,
          defaults to None
        :param type_safe: check for overflows or other unsafe conversions
        :return: pyarrow table
        """
        pdf = self.as_pandas(df)
        return pa.Table.from_pandas(
            pdf, schema=schema, preserve_index=False, safe=type_safe
        )

[docs]    def as_pandas(self, df: TDf) -> pd.DataFrame:
        """Convert the dataframe to pandas dataframe

        :return: the pandas dataframe
        """
        raise NotImplementedError  # pragma: no cover

[docs]    def as_array_iterable(
        self,
        df: TDf,
        schema: pa.Schema,
        columns: Optional[List[str]] = None,
        type_safe: bool = False,
    ) -> Iterable[List[Any]]:
        """Convert pandas like dataframe to iterable of rows in the format of list.

        :param df: pandas like dataframe
        :param schema: schema of the input
        :param columns: columns to output, None for all columns
        :param type_safe: whether to enforce the types in schema, if False, it will
            return the original values from the dataframe
        :return: iterable of rows, each row is a list

        .. note::

        If there are nested types in schema, the conversion can be slower
        """
        if self.empty(df):
            return
        if columns is not None:
            df = df[columns]
            schema = pa.schema([schema.field(n) for n in columns])
        if not type_safe:
            for arr in df.astype(object).itertuples(index=False, name=None):
                yield list(arr)
        elif all(not pa.types.is_nested(x) for x in schema.types):
            p = self.as_arrow(df, schema, type_safe)
            d = p.to_pydict()
            cols = [d[n] for n in schema.names]
            for arr in zip(*cols):
                yield list(arr)
        else:
            # If schema has nested types, the conversion will be much slower
            for arr in apply_schema(
                schema,
                df.itertuples(index=False, name=None),
                copy=True,
                deep=True,
                str_as_json=True,
            ):
                yield arr

[docs]    def as_array(
        self,
        df: TDf,
        schema,
        columns: Optional[List[str]] = None,
        type_safe: bool = False,
    ) -> List[List[Any]]:
        return list(
            self.as_array_iterable(
                df, schema=schema, columns=columns, type_safe=type_safe
            )
        )

[docs]    def to_schema(self, df: TDf) -> pa.Schema:
        """Extract pandas dataframe schema as pyarrow schema. This is a replacement
        of pyarrow.Schema.from_pandas, and it can correctly handle string type and
        empty dataframes

        :param df: pandas dataframe
        :raises ValueError: if pandas dataframe does not have named schema
        :return: pyarrow.Schema

        .. note::

        The dataframe must be either empty, or with type pd.RangeIndex, pd.Int64Index
        or pd.UInt64Index and without a name, otherwise, `ValueError` will raise.
        """
        self.ensure_compatible(df)
        assert_or_throw(
            df.columns.dtype == "object",
            ValueError("Dataframe must have named schema"),
        )

        def get_fields() -> Iterable[pa.Field]:
            for c in df.columns:
                tp = df[c].dtype
                if tp == np.dtype("object") or tp == np.dtype(str):
                    t = pa.string()
                else:
                    t = self.to_safe_pa_type(tp)
                    if pa.types.is_timestamp(t):
                        t = TRIAD_DEFAULT_TIMESTAMP
                yield pa.field(c, t)

        return pa.schema(list(get_fields()))

[docs]    def cast_df(  # noqa: C901
        self, df: TDf, schema: pa.Schema, input_schema: Optional[pa.Schema] = None
    ) -> TDf:
        """Cast a dataframe to comply with `schema`.

        :param df: pandas like dataframe
        :param schema: pyarrow schema to convert to
        :param input_schema: the known input pyarrow schema, defaults to None
        :return: converted dataframe

        .. note::

        ``input_schema`` is important because sometimes the column types can be
        different from expected. For example if a boolean series contains Nones,
        the dtype will be object, without a input type hint, the function can't
        do the conversion correctly.
        """
        if input_schema is None:
            cols = [self.cast(df[v.name], v.type) for v in schema]
        else:
            cols = [
                self.cast(df[v.name], v.type, input_type=i.type)
                for v, i in zip(schema, input_schema)
            ]
        return self.cols_to_df(cols)

[docs]    def sql_groupby_apply(
        self,
        df: TDf,
        cols: List[str],
        func: Callable[[TDf], TDf],
        output_schema: Optional[pa.Schema] = None,
        **kwargs: Any,
    ) -> TDf:
        """Safe groupby apply operation on pandas like dataframes.
        In pandas like groupby apply, if any key is null, the whole group is dropped.
        This method makes sure those groups are included.

        :param df: pandas like dataframe
        :param cols: columns to group on, can be empty
        :param func: apply function, df in, df out
        :param output_schema: output schema hint for the apply
        :return: output dataframe

        .. note::

        The dataframe must be either empty, or with type pd.RangeIndex, pd.Int64Index
        or pd.UInt64Index and without a name, otherwise, `ValueError` will raise.
        """
        raise NotImplementedError  # pragma: no cover

[docs]    def is_compatile_index(self, df: TDf) -> bool:
        """Check whether the datafame is compatible with the operations inside
        this utils collection

        :param df: pandas like dataframe
        :return: if it is compatible
        """
        return isinstance(df.index, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index))

[docs]    def ensure_compatible(self, df: TDf) -> None:
        """Check whether the datafame is compatible with the operations inside
        this utils collection, if not, it will raise ValueError

        :param df: pandas like dataframe
        :raises ValueError: if not compatible
        """
        if df.index.name is not None:
            raise SlideIndexIncompatibleError(
                "pandas like datafame index can't have name"
            )
        if self.is_compatile_index(df):
            return
        if self.empty(df):
            return
        raise SlideIndexIncompatibleError(
            f"pandas like datafame must have default index, but got {type(df.index)}"
        )

[docs]    def drop_duplicates(self, df: TDf) -> TDf:
        """Select distinct rows from dataframe

            raise SlideIndexIncompatibleError(
                "pandas like datafame index can't have name"
            )
        :return: the result with only distinct rows
        """
        return df.drop_duplicates(ignore_index=True)

[docs]    def union(self, df1: TDf, df2: TDf, unique: bool) -> TDf:
        """Union two dataframes

        :param df1: the first dataframe
        :param df2: the second dataframe
        :param unique: whether return only unique rows
        :return: unioned dataframe
        """
        ndf1, ndf2 = self._preprocess_set_op(df1, df2)
        ndf = ndf1.append(ndf2)
        if unique:
            ndf = self.drop_duplicates(ndf)
        return ndf

[docs]    def intersect(self, df1: TDf, df2: TDf, unique: bool) -> TDf:
        """Intersect two dataframes

        :param ndf1: the first dataframe
        :param ndf2: the second dataframe
        :param unique: whether return only unique rows
        :return: intersected dataframe
        """
        ndf1, ndf2 = self._preprocess_set_op(df1, df2)
        ndf = ndf1.merge(self.drop_duplicates(ndf2))
        if unique:
            ndf = self.drop_duplicates(ndf)
        return ndf

[docs]    def except_df(
        self,
        df1: TDf,
        df2: TDf,
        unique: bool,
        anti_indicator_col: str = _ANTI_INDICATOR,
    ) -> TDf:
        """Exclude df2 from df1

        :param df1: the first dataframe
        :param df2: the second dataframe
        :param unique: whether return only unique rows
        :return: df1 - df2

        .. note:

        The behavior is not well defined when unique is False
        """
        ndf1, ndf2 = self._preprocess_set_op(df1, df2)
        ndf2 = self._with_indicator(self.drop_duplicates(ndf2), anti_indicator_col)
        ndf = ndf1.merge(ndf2, how="left", on=list(ndf1.columns))
        ndf = ndf[ndf[anti_indicator_col].isnull()].drop([anti_indicator_col], axis=1)
        if unique:
            ndf = self.drop_duplicates(ndf)
        return ndf

[docs]    def join(
        self,
        ndf1: TDf,
        ndf2: TDf,
        join_type: str,
        on: List[str],
        anti_indicator_col: str = _ANTI_INDICATOR,
        cross_indicator_col: str = _CROSS_INDICATOR,
    ) -> TDf:
        """Join two dataframes.

        :param ndf1: the first dataframe
        :param ndf2: the second dataframe
        :param join_type: see :func:`~.parse_join_type`
        :param on: join keys for pandas like ``merge`` to use
        :param anti_indicator_col: temporary column name for anti join,
            defaults to _ANTI_INDICATOR
        :param cross_indicator_col: temporary column name for cross join,
            defaults to _CROSS_INDICATOR
        :raises NotImplementedError: if join type is not supported
        :return: the joined dataframe

        .. note:

        All join behaviors should be consistent with SQL correspondent joins.
        """
        join_type = parse_join_type(join_type)
        if join_type == "inner":
            ndf1 = ndf1.dropna(subset=on)
            ndf2 = ndf2.dropna(subset=on)
            joined = ndf1.merge(ndf2, how=join_type, on=on)
        elif join_type == "left_semi":
            ndf1 = ndf1.dropna(subset=on)
            ndf2 = self.drop_duplicates(ndf2[on].dropna())
            joined = ndf1.merge(ndf2, how="inner", on=on)
        elif join_type == "left_anti":
            # TODO: lack of test to make sure original ndf2 is not changed
            ndf2 = self.drop_duplicates(ndf2[on].dropna())
            ndf2 = self._with_indicator(ndf2, anti_indicator_col)
            joined = ndf1.merge(ndf2, how="left", on=on)
            joined = joined[joined[anti_indicator_col].isnull()].drop(
                [anti_indicator_col], axis=1
            )
        elif join_type == "left_outer":
            ndf2 = ndf2.dropna(subset=on)
            joined = ndf1.merge(ndf2, how="left", on=on)
        elif join_type == "right_outer":
            ndf1 = ndf1.dropna(subset=on)
            joined = ndf1.merge(ndf2, how="right", on=on)
        elif join_type == "full_outer":
            add: List[str] = []
            add_df1: Dict[str, TCol] = {}
            add_df2: Dict[str, TCol] = {}
            for f in on:
                name = f + "_null"
                s1 = ndf1[f].isnull().astype(int)
                add_df1[name] = s1
                s2 = ndf2[f].isnull().astype(int) * 2
                add_df2[name] = s2
                add.append(name)
            joined = (
                ndf1.assign(**add_df1)
                .merge(ndf2.assign(**add_df2), how="outer", on=add + on)
                .drop(add, axis=1)
            )
        elif join_type == "cross":
            assert_or_throw(
                len(on) == 0, ValueError(f"cross join can't have join keys {on}")
            )
            ndf1 = self._with_indicator(ndf1, cross_indicator_col)
            ndf2 = self._with_indicator(ndf2, cross_indicator_col)
            joined = ndf1.merge(ndf2, how="inner", on=[cross_indicator_col]).drop(
                [cross_indicator_col], axis=1
            )
        else:  # pragma: no cover
            raise NotImplementedError(join_type)
        return joined

    def _set_op_result_to_none(self, series: Any, s1: Any, s2: Any) -> Any:
        if not self.is_series(series):
            return series
        if self.is_series(s1):
            series = series.mask(s1.isnull(), None)
        if self.is_series(s2):
            series = series.mask(s2.isnull(), None)
        return series

    def _safe_bool(self, col: Any) -> Any:
        if self.is_series(col):
            return col.astype("boolean")
        if col is None:
            return None
        return col != 0

    def _preprocess_set_op(self, ndf1: TDf, ndf2: TDf) -> Tuple[TDf, TDf]:
        assert_or_throw(
            len(list(ndf1.columns)) == len(list(ndf2.columns)),
            ValueError("two dataframes have different number of columns"),
        )
        cols: List[TCol] = []
        same = True
        for c1, c2 in zip(ndf1.columns, ndf2.columns):
            same &= c1 == c2
            cols.append(ndf2[c2])
        if same:
            return ndf1, ndf2
        return ndf1, self.cols_to_df(cols, list(ndf1.columns))

    def _with_indicator(self, df: TDf, name: str) -> TDf:
        return df.assign(**{name: 1})

    def _cast_to_bool(
        self,
        col: TCol,
        from_type: pa.DataType,
        inf_type: pa.DataType,
        safe_dtype: np.dtype,
    ) -> TCol:
        if pa.types.is_boolean(from_type):
            if (
                pa.types.is_integer(inf_type)
                or pa.types.is_floating(inf_type)
                or pa.types.is_string(inf_type)  # bool/int with nulls
            ):
                nulls = col.isnull()
                return (col != 0).mask(nulls, pd.NA).astype(safe_dtype)
        elif pa.types.is_integer(from_type) or pa.types.is_floating(from_type):
            nulls = col.isnull()
            return (col != 0).mask(nulls, pd.NA).astype(safe_dtype)
        elif pa.types.is_string(from_type):
            lower = col.str.lower()
            res = lower.isin(["true", "1", "1.0"])
            nulls = (~res) & (~lower.isin(["false", "0", "0.0"]))
            return res.mask(nulls, pd.NA).astype(safe_dtype)
        raise SlideCastError(f"unable to cast from {from_type} to {safe_dtype}")

    def _cast_to_int(
        self,
        col: TCol,
        from_type: pa.DataType,
        inf_type: pa.DataType,
        safe_dtype: np.dtype,
    ) -> TCol:
        def _convert_int_like() -> TCol:
            nulls = col.isnull()
            tp = to_single_pandas_dtype(
                self.to_safe_pa_type(safe_dtype), use_extension_types=False
            )
            return col.fillna(0).astype(tp).astype(safe_dtype).mask(nulls, pd.NA)

        if pa.types.is_boolean(from_type):
            if pa.types.is_string(inf_type):  # bool with nulls
                return _convert_int_like()
            return col.astype(safe_dtype)
        elif pa.types.is_integer(from_type):
            if pa.types.is_string(inf_type):  # pragma: no cover
                # int with nulls
                return _convert_int_like()
            return col.astype(safe_dtype)
        elif pa.types.is_floating(from_type):
            nulls = col.isnull()
            tp = to_single_pandas_dtype(
                self.to_safe_pa_type(safe_dtype), use_extension_types=False
            )
            return col.fillna(0).astype(tp).astype(safe_dtype).mask(nulls, pd.NA)
        elif pa.types.is_string(from_type):  # integer string representations
            # SQL can convert '1.1' to 1 as an integer
            temp = self._cast_to_float(
                col, from_type=from_type, inf_type=inf_type, safe_dtype=np.float64
            )
            nulls = temp.isnull()
            tp = to_single_pandas_dtype(
                self.to_safe_pa_type(safe_dtype), use_extension_types=False
            )
            return temp.fillna(0).astype(tp).astype(safe_dtype).mask(nulls, pd.NA)
        raise SlideCastError(f"unable to cast from {from_type} to {safe_dtype}")

    def _cast_to_float(
        self,
        col: TCol,
        from_type: pa.DataType,
        inf_type: pa.DataType,
        safe_dtype: np.dtype,
    ) -> TCol:
        return col.astype(safe_dtype)

    def _cast_to_str(
        self,
        col: TCol,
        from_type: pa.DataType,
        inf_type: pa.DataType,
        safe_dtype: np.dtype,
    ) -> TCol:
        nulls = col.isnull()
        if pa.types.is_boolean(from_type):
            if pa.types.is_boolean(inf_type):
                return col.astype(safe_dtype).str.lower().mask(nulls, pd.NA)
            if (
                pa.types.is_integer(inf_type)
                or pa.types.is_floating(inf_type)
                or pa.types.is_string(inf_type)  # bool with nulls
            ):
                return (
                    (col != 0)
                    .astype("boolean")
                    .astype(safe_dtype)
                    .str.lower()
                    .mask(nulls, pd.NA)
                )
            else:  # pragma: no cover
                raise SlideCastError(
                    f"underlying data type {inf_type} is impossible to be boolean"
                )
        if pa.types.is_integer(from_type) and inf_type != from_type:
            return (
                col.fillna(0)
                .astype(to_single_pandas_dtype(from_type, use_extension_types=False))
                .astype(safe_dtype)
                .mask(nulls, pd.NA)
            )
        return col.astype(safe_dtype).mask(nulls, pd.NA)

    def _cast_to_datetime(
        self,
        col: TCol,
        from_type: pa.DataType,
        inf_type: pa.DataType,
        safe_dtype: np.dtype,
    ) -> TCol:
        return col.astype(safe_dtype)

    def _cast_to_date(
        self,
        col: TCol,
        from_type: pa.DataType,
        inf_type: pa.DataType,
        safe_dtype: np.dtype,
    ) -> TCol:
        if pd.__version__ < "1.2":  # pragma: no cover
            return col.astype(safe_dtype).dt.floor("D")
        return col.astype(safe_dtype).dt.date