Source code for tybles

"""
Data-frame typing / schema documentation module

This module provides helpers to read/process/write `pandas <https://pandas.pydata.org/>`_
:class:`~pandas.DataFrame` instances with simple validation.

.. rubric:: Types

.. py:data:: _RowSpec

    Dataclass specifying a dataframe row
"""

import collections

__version__ = "0.3.1"
import os
from dataclasses import asdict, dataclass, fields, is_dataclass
from typing import (
    Any,
    Callable,
    Dict,
    Generic,
    Iterable,
    Literal,
    Mapping,
    NoReturn,
    Sequence,
    Set,
    SupportsIndex,
    TextIO,
    Type,
    TypeVar,
    Union,
    overload,
)

import numpy as np
import pandas as pd
from typing_extensions import ParamSpec, get_type_hints


def _assert_never(x: NoReturn) -> NoReturn:
    """
    Fails unconditionally; marks a branch that should be unreachable

    Args:
        x: The value that reached the supposedly unreachable branch

    Raises:
        AssertionError: Always, with the ``repr`` of ``x`` in the message
    """
    message = f"Invalid value: {x!r}"
    raise AssertionError(message)


# Names exported by a star-import of this module
__all__ = ["Schema", "Tyble", "schema", "tyble"]

# Type variable standing for the dataclass that specifies one dataframe row
_RowSpec = TypeVar("_RowSpec")


@dataclass(frozen=True)
class Schema(Generic[_RowSpec]):
    """
    Describes the structure of a Pandas dataframe

    In Tybles, a schema is derived from a dataclass describing one row of the dataframe.
    """

    #: Row specification
    row_spec: Type[_RowSpec]

    #: Whether to order columns in the dataframe as in the row specification
    order_columns: bool

    #: What to do with missing columns
    #:
    #: This occurs when reading/creating a dataframe *and* when writing/exporting a dataframe.
    #:
    #: The possible values are:
    #:
    #: - "error": raise an error (default)
    #: - "missing": leave the missing columns missing (set ``validate`` to False then)
    #: - "fill": fill the columns with the dtype default value
    missing_columns: Union[Literal["error"], Literal["missing"], Literal["fill"]]

    #: What to do with extra columns present, that are not part of the row specification
    #:
    #: - "drop": remove the extra columns from the dataframe (default)
    #: - "keep": keep the extra columns in the dataframe (note that the dtype is autodetected)
    #: - "error": raise an error
    extra_columns: Union[Literal["drop"], Literal["keep"], Literal["error"]]

    #: Whether to run validation on every row of the data
    #:
    #: If the `typeguard <https://github.com/agronholm/typeguard>`_ library is present, this will
    #: use :func:`typeguard.check_type`, otherwise a simple :func:`isinstance` check will be done.
    validate: bool

    #: Names of the fields in the schema, in order of definition
    field_names: Sequence[str]

    #: Mapping of field names with associated dtypes
    #:
    #: Can also serve as a ``dtype=`` argument for various Pandas functions
    dtypes: Mapping[str, np.dtype]

    #: Mapping of field names with associated annotated types
    annotated_types: Mapping[str, type]

    def validate_row(self, row: _RowSpec) -> None:
        """
        Validates the given row and raises an exception if validation fails

        Every optional checker that can be imported (typeguard, beartype) is run; when
        none is available, each field is checked with a plain :func:`isinstance` test.

        Args:
            row: Row to validate

        Raises:
            TypeError: If typeguard or standard validation failed
            BeartypeException: If beartype failed
        """
        checked = False

        # Only the import is guarded, so an ImportError raised while checking is not
        # mistaken for "library not installed".
        try:
            from typeguard import check_type
        except ImportError:
            pass
        else:
            # NOTE(review): the positional (argname, value, expected_type) form is the
            # typeguard 2.x calling convention -- confirm the supported typeguard version.
            check_type("Row in dataframe", row, self.row_spec)
            checked = True

        try:
            try:
                # Current home of die_if_unbearable
                from beartype.door import die_if_unbearable
            except ImportError:
                # Legacy location used by older beartype releases
                from beartype.abby import die_if_unbearable
        except ImportError:
            pass
        else:
            for name, typ in self.annotated_types.items():
                die_if_unbearable(getattr(row, name), typ)
            checked = True

        if not checked:
            # No third-party checker available: fall back to isinstance per field
            for name, typ in self.annotated_types.items():
                value = getattr(row, name)
                if not isinstance(value, typ):
                    raise TypeError(
                        f"Field {name} with value {value} does not conform to type {typ}"
                    )

    @overload
    def from_rows(
        self,
        rows: Sequence[_RowSpec],
        return_type: Literal["DataFrame"] = "DataFrame",
        **kwargs,
    ) -> pd.DataFrame:
        pass

    @overload
    def from_rows(
        self, rows: Sequence[_RowSpec], return_type: Literal["Tyble"], **kwargs
    ) -> "Tyble[_RowSpec]":
        pass

    def from_rows(
        self,
        rows: Sequence[_RowSpec],
        return_type: Union[Literal["DataFrame"], Literal["Tyble"]] = "DataFrame",
        **kwargs,
    ) -> Union[pd.DataFrame, "Tyble[_RowSpec]"]:
        """
        Returns a pandas DataFrame (possibly as an enriched Tyble) from row instances

        Args:
            rows: Rows as a sequence of dataclass instances

        Keyword Args:
            return_type: Whether to return a pandas :class:`~pandas.DataFrame` (default) or
                         a :class:`.Tyble` instance
            kwargs: Extra keyword arguments are passed to :meth:`pandas.DataFrame.from_records`

        Returns:
            A pandas DataFrame, possibly wrapped in a Tyble
        """
        records = [asdict(row) for row in rows]
        # Forward the extra keyword arguments, as documented above
        df = pd.DataFrame.from_records(records, **kwargs)
        return self.process_raw_data_frame(df, return_type)

    @overload
    def read_csv(
        self,
        filepath_or_buffer: Union[TextIO, str, bytes, os.PathLike],
        return_type: Literal["DataFrame"] = "DataFrame",
        **kw_args,
    ) -> pd.DataFrame:
        pass

    @overload
    def read_csv(
        self,
        filepath_or_buffer: Union[TextIO, str, bytes, os.PathLike],
        return_type: Literal["Tyble"],
        **kw_args,
    ) -> "Tyble[_RowSpec]":
        pass

    def read_csv(
        self,
        filepath_or_buffer: Union[TextIO, str, bytes, os.PathLike],
        return_type: Union[Literal["DataFrame"], Literal["Tyble"]] = "DataFrame",
        **kw_args,
    ) -> Union[pd.DataFrame, "Tyble[_RowSpec]"]:
        """
        Reads a pandas DataFrame from a CSV file, shaping up and validating the data on demand

        Args:
            filepath_or_buffer: Path or open file to read from

        Keyword Args:
            return_type: Whether to return a pandas :class:`~pandas.DataFrame` (default) or
                         a :class:`.Tyble` instance
            kw_args: Additional keyword arguments not listed above are passed to
                     :func:`pandas.read_csv`

        Returns:
            A pandas dataframe, possibly wrapped in a Tyble
        """
        return self.process_raw_data_frame(
            pd.read_csv(
                filepath_or_buffer,
                dtype=self.dtypes,
                **kw_args,
            ),
            return_type,
        )

    @overload
    def process_raw_data_frame(
        self, df: pd.DataFrame, return_type: Literal["DataFrame"] = "DataFrame"
    ) -> pd.DataFrame:
        pass

    @overload
    def process_raw_data_frame(
        self, df: pd.DataFrame, return_type: Literal["Tyble"]
    ) -> "Tyble[_RowSpec]":
        pass

    def process_raw_data_frame(
        self,
        df: pd.DataFrame,
        return_type: Union[Literal["DataFrame"], Literal["Tyble"]] = "DataFrame",
    ) -> Union[pd.DataFrame, "Tyble[_RowSpec]"]:
        """
        Applies the schema policies (missing/extra columns, ordering, validation) to a dataframe

        Args:
            df: Dataframe to process, will be mutated. In any case, one should use
                the dataframe returned by this function. (The code may or may not mutate
                in place this given dataframe.)
            return_type: Whether to return a pandas :class:`~pandas.DataFrame` (default) or
                         a :class:`.Tyble` instance

        Raises:
            ValueError: If the dataframe fails the ``missing_columns`` or ``extra_columns`` checks
            TypeError: If typeguard validation failed
            BeartypeException: If beartype failed

        Returns:
            The processed dataframe or a dataframe wrapped in a :class:`.Tyble` instance
        """
        known = set(self.field_names)

        if self.missing_columns != "missing":
            missing: Set[str] = known.difference(df.columns)
            if missing:
                if self.missing_columns == "error":
                    # Sorted so that the message is deterministic
                    raise ValueError(
                        "Missing columns in CSV file: " + ", ".join(sorted(missing))
                    )
                if self.missing_columns == "fill":
                    for name, dt in self.dtypes.items():
                        if name in missing:
                            # np.zeros yields the zero value of the dtype for every row
                            df[name] = pd.Series(np.zeros((len(df),), dtype=dt), index=df.index)

        if self.extra_columns != "keep":
            extra = set(df.columns).difference(known)
            if extra:
                if self.extra_columns == "error":
                    # Column labels need not be strings; stringify before joining
                    raise ValueError(
                        "Extra columns in CSV file: " + ", ".join(sorted(map(str, extra)))
                    )
                if self.extra_columns == "drop":
                    df.drop(list(extra), axis="columns", inplace=True)

        if self.order_columns:
            # Schema columns first, in definition order, then any kept extra columns
            in_spec: Sequence[str] = [n for n in self.field_names if n in df.columns]
            extra1: Sequence[str] = [n for n in df.columns if n not in known]
            df = df.loc[:, [*in_spec, *extra1]]

        if self.validate:
            for row in Tyble(df, self):
                self.validate_row(row)

        if return_type == "DataFrame":
            return df
        elif return_type == "Tyble":
            return Tyble(df, self)
        else:
            _assert_never(return_type)
def schema(
    row_spec: Type[_RowSpec],
    *,
    order_columns: bool = True,
    missing_columns: Union[Literal["error"], Literal["missing"], Literal["fill"]] = "error",
    extra_columns: Union[Literal["drop"], Literal["keep"], Literal["error"]] = "drop",
    validate: bool = True,
) -> Schema[_RowSpec]:
    """
    Creates a dataframe schema from a row specification dataclass

    For detailed description of the keyword arguments, look up the :class:`.Schema` attributes
    documentation.

    Args:
        row_spec: Data class specifying a dataframe row

    Keyword Args:
        order_columns: Whether to order the dataframe columns as in the specification
        missing_columns: What to do with missing columns
        extra_columns: What to do with extra columns
        validate: Whether to perform validation

    Raises:
        AssertionError: If ``row_spec`` is not a dataclass, or if ``missing_columns`` is
                        ``"missing"`` while ``validate`` is true

    Returns:
        A dataframe schema
    """
    # Kept as assertions so that callers still observe AssertionError
    assert is_dataclass(row_spec), "The row specification must be a dataclass"
    assert hasattr(row_spec, "__annotations__")
    if missing_columns == "missing":
        assert not validate, "When missing_columns = missing, validate should be False"

    field_names = [f.name for f in fields(row_spec)]
    plain_hints = get_type_hints(row_spec, include_extras=False)
    annotated_hints = get_type_hints(row_spec, include_extras=True)

    # get_type_hints also reports annotations that are not dataclass fields
    # (ClassVar, InitVar); only real fields correspond to dataframe columns.
    dtypes = {name: np.dtype(plain_hints[name]) for name in field_names}
    annotated_types = {name: annotated_hints[name] for name in field_names}

    return Schema(
        row_spec,
        dtypes=dtypes,
        annotated_types=annotated_types,
        field_names=field_names,
        order_columns=order_columns,
        missing_columns=missing_columns,
        extra_columns=extra_columns,
        validate=validate,
    )
@dataclass(frozen=True)
class Tyble(Sequence[_RowSpec], Generic[_RowSpec]):
    """
    Describes a Pandas dataframe enriched with a schema

    Behaves as a read-only sequence whose items are instances of the schema row
    specification, built on demand from the rows of the dataframe.
    """

    #: Underlying pandas dataframe
    data_frame: pd.DataFrame

    #: Schema describing the rows of the dataframe
    schema: Schema[_RowSpec]

    def __len__(self) -> int:
        """Returns the number of rows of the dataframe"""
        return len(self.data_frame)

    def to_rows(self) -> Sequence[_RowSpec]:
        """
        Returns all the rows of the dataframe as row specification instances
        """
        return self[:]

    @overload
    def __getitem__(self, index: int) -> _RowSpec:
        pass

    @overload
    def __getitem__(self, index: slice) -> Sequence[_RowSpec]:
        pass

    @overload
    def __getitem__(self, index: Sequence[int]) -> Sequence[_RowSpec]:
        pass

    @overload
    def __getitem__(self, index: SupportsIndex) -> _RowSpec:
        pass

    @overload
    def __getitem__(self, index: Iterable[SupportsIndex]) -> Sequence[_RowSpec]:
        pass

    def __getitem__(self, index: Any) -> Union[_RowSpec, Sequence[_RowSpec]]:
        """
        Returns one row, or a list of rows, by position

        Args:
            index: A positional index, a slice, or an iterable of positional indices

        Raises:
            ValueError: If the argument is neither an index nor a sequence of indices
            IndexError: If a positional index is out of bounds

        Returns:
            A row specification instance for a single index, a list of them otherwise
        """
        if isinstance(index, SupportsIndex):
            # NOTE(review): a row taken with ``iloc`` is a Series with one common dtype,
            # so values from mixed-dtype frames may come back upcast -- confirm this is
            # acceptable.
            row = self.data_frame.iloc[index.__index__()]
            content = {name: row[name] for name in self.schema.dtypes.keys()}
            return self.schema.row_spec(**content)
        elif isinstance(index, str):
            # A string is an iterable of one-character strings; without this guard the
            # Iterable branch below would recurse forever instead of rejecting it.
            raise ValueError("Argument must be either an index or a sequence of indices")
        elif isinstance(index, Iterable):
            return [self[i] for i in index]
        elif isinstance(index, slice):
            return self[range(*index.indices(len(self.data_frame)))]
        else:
            raise ValueError("Argument must be either an index or a sequence of indices")

    def __repr__(self) -> str:
        """Returns a multi-line summary: row specification name, first row, dataframe"""
        tn = self.schema.row_spec.__name__
        if len(self) == 0:
            return f"Empty Tyble for row_spec={tn}"
        else:
            return "\n".join(
                [
                    f"Tyble: self.schema.row_spec={tn}",
                    f" self[0]={self[0]}",
                    " self.data_frame=",
                    *self.data_frame.__repr__().split("\n"),
                ]
            )
def tyble(data_frame: pd.DataFrame, schema: Schema[_RowSpec]) -> Tyble[_RowSpec]:
    """
    Wraps a dataframe and its schema in a :class:`.Tyble`

    Args:
        data_frame: Dataframe to wrap
        schema: Schema describing the rows of the dataframe

    Returns:
        The dataframe enriched with the schema
    """
    wrapped = Tyble(data_frame=data_frame, schema=schema)
    return wrapped