"""
Data-frame typing / schema documentation module

This module provides helpers to read/process/write `pandas <https://pandas.pydata.org/>`_
:class:`~pandas.DataFrame` instances with simple validation.

.. rubric:: Types

.. py:data:: _RowSpec

    Type variable representing the dataclass that specifies one row of the dataframe
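
A minimal usage sketch (the ``Point`` dataclass and the ``tybles`` import name below are
illustrative assumptions)::

    from dataclasses import dataclass

    import numpy as np
    import tybles as tb

    @dataclass(frozen=True)
    class Point:
        # hypothetical row specification, reused in the examples throughout this module
        x: np.float64
        y: np.float64

    point_schema = tb.schema(Point)
    df = point_schema.from_rows([Point(np.float64(1.0), np.float64(2.0))])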
"""
__version__ = "0.3.1"

import os
from dataclasses import asdict, dataclass, fields, is_dataclass
from typing import (
Any,
Generic,
Iterable,
Literal,
Mapping,
NoReturn,
Sequence,
Set,
SupportsIndex,
TextIO,
Type,
TypeVar,
Union,
overload,
)
import numpy as np
import pandas as pd
from typing_extensions import get_type_hints
def _assert_never(x: NoReturn) -> NoReturn:
raise AssertionError(f"Invalid value: {x!r}")
__all__ = ["Schema", "Tyble", "schema", "tyble"]
_RowSpec = TypeVar("_RowSpec")
@dataclass(frozen=True)
class Schema(Generic[_RowSpec]):
"""
    Describes the structure of a pandas dataframe

    In Tybles, a schema is derived from a dataclass describing one row of the dataframe.
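
    For instance, the attributes of a schema derived from the illustrative ``Point`` row
    dataclass (see the module-level example) can be inspected directly; a brief sketch::

        point_schema = schema(Point)
        point_schema.field_names  # e.g. ["x", "y"]
        point_schema.dtypes       # e.g. {"x": dtype("float64"), "y": dtype("float64")}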
"""
#: Row specification
row_spec: Type[_RowSpec]
#: Whether to order columns in the dataframe as in the row specification
order_columns: bool
#: What to do with missing columns
#:
#: This occurs when reading/creating a dataframe *and* when writing/exporting a dataframe.
#:
#: The possible values are:
#:
#: - "error": raise an error (default)
#: - "missing": leave the missing columns missing (set ``validate`` to False then)
#: - "fill": fill the columns with the dtype default value
missing_columns: Union[Literal["error"], Literal["missing"], Literal["fill"]]
    #: What to do with extra columns that are present but not part of the row specification
#:
#: - "drop": remove the extra columns from the dataframe (default)
#: - "keep": keep the extra columns in the dataframe (note that the dtype is autodetected)
#: - "error": raise an error
extra_columns: Union[Literal["drop"], Literal["keep"], Literal["error"]]
#: Whether to run validation on every row of the data
#:
#: If the `typeguard <https://github.com/agronholm/typeguard>`_ library is present, this will
#: use :func:`typeguard.check_type`, otherwise a simple :func:`isinstance` check will be done.
validate: bool
#: Names of the fields in the schema, in order of definition
field_names: Sequence[str]
#: Mapping of field names with associated dtypes
#:
#: Can also serve as a ``dtype=`` argument for various Pandas functions
dtypes: Mapping[str, np.dtype]
#: Mapping of field names with associated annotated types
annotated_types: Mapping[str, type]
    def validate_row(self, row: _RowSpec) -> None:
"""
        Validates the given row and raises an exception if validation fails

        Args:
            row: Row to validate

        Raises:
            TypeError: If the typeguard check or the fallback ``isinstance`` check fails
            BeartypeException: If the beartype check fails
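
        A short sketch, reusing the illustrative ``Point`` dataclass and ``point_schema`` schema
        from the module-level example::

            point_schema.validate_row(Point(np.float64(1.0), np.float64(2.0)))  # passes silently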
"""
checked = False
try:
from typeguard import check_type
check_type("Row in dataframe", row, self.row_spec)
checked = True
except ImportError:
pass
try:
from beartype.abby import die_if_unbearable
for name, typ in self.annotated_types.items():
die_if_unbearable(getattr(row, name), typ)
checked = True
except ImportError:
pass
if not checked:
for name, typ in self.annotated_types.items():
value = getattr(row, name)
if not isinstance(value, typ):
raise TypeError(
f"Field {name} with value {value} does not conform to type {typ}"
)
@overload
def from_rows(
self, rows: Sequence[_RowSpec], return_type: Literal["DataFrame"] = "DataFrame", **kwargs
) -> pd.DataFrame:
pass
@overload
def from_rows(
self, rows: Sequence[_RowSpec], return_type: Literal["Tyble"], **kwargs
    ) -> "Tyble[_RowSpec]":
pass
    def from_rows(
self,
rows: Sequence[_RowSpec],
return_type: Union[Literal["DataFrame"], Literal["Tyble"]] = "DataFrame",
**kwargs,
) -> Union[pd.DataFrame, "Tyble[_RowSpec]"]:
"""
Returns a pandas DataFrame (possibly as an enriched Tyble) from row instances
Args:
rows: Rows as a sequence of dataclass instances
Keyword Args:
return_type: Whether to return a pandas :class:`~pandas.DataFrame` (default)
or a :class:`.Tyble` instance
kwargs: Extra keyword arguments are passed to :meth:`pandas.DataFrame.from_records`
Returns:
A pandas DataFrame, possibly wrapped in a Tyble
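
        A brief sketch, using the illustrative ``Point`` dataclass and ``point_schema`` schema
        from the module-level example::

            rows = [Point(np.float64(1.0), np.float64(2.0))]
            df = point_schema.from_rows(rows)
            ty = point_schema.from_rows(rows, return_type="Tyble")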
"""
        df = pd.DataFrame.from_records([asdict(row) for row in rows], **kwargs)
return self.process_raw_data_frame(df, return_type)
@overload
def read_csv(
self,
filepath_or_buffer: Union[TextIO, str, bytes, os.PathLike],
return_type: Literal["DataFrame"] = "DataFrame",
**kw_args,
) -> pd.DataFrame:
pass
@overload
def read_csv(
self,
filepath_or_buffer: Union[TextIO, str, bytes, os.PathLike],
return_type: Literal["Tyble"],
**kw_args,
) -> "Tyble[_RowSpec]":
pass
    def read_csv(
self,
filepath_or_buffer: Union[TextIO, str, bytes, os.PathLike],
return_type: Union[Literal["DataFrame"], Literal["Tyble"]] = "DataFrame",
**kw_args,
) -> Union[pd.DataFrame, "Tyble[_RowSpec]"]:
"""
        Reads a pandas DataFrame from a CSV file, shaping and validating the data as configured in this schema
Args:
filepath_or_buffer: Path or open file to read from
Keyword Args:
return_type: Whether to return a pandas :class:`~pandas.DataFrame` (default)
or a :class:`.Tyble` instance
kw_args: Additional keyword arguments not listed above are passed to :func:`pandas.read_csv`
Returns:
A pandas dataframe, possibly wrapped in a Tyble
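
        A brief sketch (the file name and the ``point_schema`` object are illustrative)::

            ty = point_schema.read_csv("points.csv", return_type="Tyble")
            first_row = ty[0]  # an instance of the row dataclass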
"""
return self.process_raw_data_frame(
pd.read_csv(
filepath_or_buffer,
dtype=self.dtypes,
**kw_args,
),
return_type,
)
@overload
def process_raw_data_frame(
self, df: pd.DataFrame, return_type: Literal["DataFrame"] = "DataFrame"
) -> pd.DataFrame:
pass
@overload
def process_raw_data_frame(
self, df: pd.DataFrame, return_type: Literal["Tyble"]
) -> "Tyble[_RowSpec]":
pass
    def process_raw_data_frame(
self,
df: pd.DataFrame,
return_type: Union[Literal["DataFrame"], Literal["Tyble"]] = "DataFrame",
) -> Union[pd.DataFrame, "Tyble[_RowSpec]"]:
"""
        Shapes a raw dataframe so that it conforms to this schema, optionally validating its rows

        Args:
            df: Dataframe to process. It may or may not be mutated in place, so always use the
                dataframe returned by this function.

        Keyword Args:
            return_type: Whether to return a pandas :class:`~pandas.DataFrame` (default)
                or a :class:`.Tyble` instance

Raises:
ValueError: If the dataframe fails the ``missing_columns`` or ``extra_columns`` checks
TypeError: If typeguard validation failed
BeartypeException: If beartype failed
Returns:
The processed dataframe or a dataframe wrapped in a :class:`.Tyble` instance
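
        A brief sketch (``point_schema`` is the illustrative schema from the module-level example)::

            raw = pd.DataFrame({"y": [2.0], "x": [1.0], "unused": ["?"]})
            df = point_schema.process_raw_data_frame(raw)
            # with the default settings, "unused" is dropped and the columns are ordered as x, y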
"""
if self.missing_columns != "missing":
missing: Set[str] = set(self.field_names).difference(df.columns)
if missing:
if self.missing_columns == "error":
raise ValueError("Missing columns in CSV file: " + ", ".join(missing))
if self.missing_columns == "fill":
for name, dt in self.dtypes.items():
if name in missing:
df[name] = pd.Series(np.zeros((len(df),), dtype=dt), index=df.index)
if self.extra_columns != "keep":
extra: Set[str] = set(df.columns).difference(self.field_names)
if extra:
if self.extra_columns == "error":
raise ValueError("Extra columns in CSV file: " + ", ".join(extra))
if self.extra_columns == "drop":
df.drop(list(extra), axis="columns", inplace=True)
if self.order_columns:
in_spec: Sequence[str] = [n for n in self.field_names if n in df.columns]
extra1: Sequence[str] = [n for n in df.columns if n not in self.field_names]
df = df.loc[:, [*in_spec, *extra1]]
if self.validate:
for row in Tyble(df, self):
self.validate_row(row)
if return_type == "DataFrame":
return df
elif return_type == "Tyble":
return Tyble(df, self)
else:
_assert_never(return_type)
def schema(
row_spec: Type[_RowSpec],
*,
order_columns: bool = True,
missing_columns: Union[Literal["error"], Literal["missing"], Literal["fill"]] = "error",
extra_columns: Union[Literal["drop"], Literal["keep"], Literal["error"]] = "drop",
validate: bool = True,
) -> Schema[_RowSpec]:
"""
Creates a dataframe schema from a row specification dataclass
For detailed description of the keyword arguments, look up the :class:`.Schema` attributes
documentation.
Args:
row_spec: Data class specifying a dataframe row
Keyword Args:
order_columns: Whether to order the dataframe columns as in the specification
missing_columns: What to do with missing columns
extra_columns: What to do with extra columns
validate: Whether to perform validation
Returns:
A dataframe schema
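
    A brief sketch of the keyword options (``Point`` is the illustrative row dataclass from the
    module-level example)::

        lenient_schema = schema(
            Point,
            missing_columns="fill",  # absent columns are filled with the dtype default value
            extra_columns="keep",
            validate=False,
        )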
"""
assert is_dataclass(row_spec), "The row specification must be a dataclass"
assert hasattr(row_spec, "__annotations__")
if missing_columns == "missing":
assert not validate, "When missing_columns = missing, validate should be False"
dtypes = {n: np.dtype(t) for n, t in get_type_hints(row_spec, include_extras=False).items()}
return Schema(
row_spec,
dtypes=dtypes,
annotated_types=get_type_hints(row_spec, include_extras=True),
field_names=[f.name for f in fields(row_spec)],
order_columns=order_columns,
missing_columns=missing_columns,
extra_columns=extra_columns,
validate=validate,
)
@dataclass(frozen=True)
class Tyble(Sequence[_RowSpec], Generic[_RowSpec]):
"""
    A pandas dataframe enriched with a schema, usable as a sequence of row dataclass instances
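
    A brief sketch (``point_schema`` is the illustrative schema from the module-level example)::

        ty = point_schema.read_csv("points.csv", return_type="Tyble")
        len(ty)   # number of rows
        ty[0]     # first row, as a row dataclass instance
        list(ty)  # all rows, as row dataclass instances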
"""
data_frame: pd.DataFrame
schema: Schema[_RowSpec]
def __len__(self) -> int:
return len(self.data_frame)
    def to_rows(self) -> Sequence[_RowSpec]:
        """Returns the dataframe contents as a sequence of row dataclass instances"""
        return self[:]
@overload
def __getitem__(self, index: int) -> _RowSpec:
pass
@overload
def __getitem__(self, index: slice) -> Sequence[_RowSpec]:
pass
@overload
def __getitem__(self, index: Sequence[int]) -> Sequence[_RowSpec]:
pass
@overload
def __getitem__(self, index: SupportsIndex) -> _RowSpec:
pass
@overload
def __getitem__(self, index: Iterable[SupportsIndex]) -> Sequence[_RowSpec]:
pass
def __getitem__(self, index: Any) -> Union[_RowSpec, Sequence[_RowSpec]]:
if isinstance(index, SupportsIndex):
row = self.data_frame.iloc[index.__index__()]
content = {name: row[name] for name in self.schema.dtypes.keys()}
return self.schema.row_spec(**content)
elif isinstance(index, Iterable):
return [self[i] for i in index]
elif isinstance(index, slice):
return self[range(*index.indices(len(self.data_frame)))]
else:
raise ValueError("Argument must be either an index or a sequence of indices")
def __repr__(self) -> str:
tn = self.schema.row_spec.__name__
if len(self) == 0:
return f"Empty Tyble for row_spec={tn}"
else:
return "\n".join(
[
f"Tyble: self.schema.row_spec={tn}",
f" self[0]={self[0]}",
" self.data_frame=",
*self.data_frame.__repr__().split("\n"),
]
)
def tyble(data_frame: pd.DataFrame, schema: Schema[_RowSpec]) -> Tyble[_RowSpec]:
    """Wraps the given pandas DataFrame and its schema in a :class:`.Tyble`"""
    return Tyble(data_frame, schema)