Source code for exatomic.exa.core.numerical

# -*- coding: utf-8 -*-
# Copyright (c) 2015-2022, Exa Analytics Development Team
# Distributed under the terms of the Apache License 2.0
"""
Data Objects
###################################
Data objects are used to store typed data coming from an external source (for
example a file on disk). There are three primary data objects provided by
this module, :class:`~exatomic.exa.core.numerical.Series`, :class:`~exatomic.exa.core.numerical.DataFrame`,
and :class:`~exatomic.exa.core.numerical.Field`. The purpose of these objects is to facilitate
conversion of data into "traits" used in visualization and enforce relationships
between data objects in a given container. Any of the objects provided by this
module may be extended.
"""
import logging
import warnings
import numpy as np
import pandas as pd
from exatomic.exa.core.error import RequiredColumnError


class Numerical(object):
    """
    Common ancestor of :class:`~exatomic.exa.core.numerical.Series`,
    :class:`~exatomic.exa.core.numerical.DataFrame`, and
    :class:`~exatomic.exa.core.numerical.Field`.

    It supplies a per-class logger, index based slicing and a string
    representation that mirrors ``repr``.
    """
    @property
    def log(self):
        """Logger whose name is ``<module>.<class name>`` of this object."""
        parts = (self.__module__, self.__class__.__name__)
        return logging.getLogger('.'.join(parts))

    def slice_naive(self, key):
        """
        Select entries of this object through its index.

        Args:
            key: Single index value, slice, tuple, or list of indices/positionals

        Returns:
            data: New object of the same class holding the selected entries
        """
        # check_key validates the key type and resolves it for use with .loc
        resolved = check_key(self, key)
        selection = self.loc[resolved]
        return self.__class__(selection)

    def __str__(self):
        # The printed form is intentionally the same as the repr
        return self.__repr__()


class BaseSeries(Numerical):
    """
    Shared behavior of dense and sparse series objects (labeled arrays).

    Subclasses may pin a series name and/or an index name; both are enforced
    when an instance is created.

    Attributes:
        _sname (str): May have a required name (default None)
        _iname (str): May have a required index name
        _stype (type): May have a required value type
        _itype (type): May have a required index type
    """
    _metadata = ['name', 'meta']
    # Optional requirements, overridden by subclasses as needed
    _sname = None           # required series name
    _iname = None           # required index name
    _stype = None           # required value type
    _itype = None           # required index type

    def __init__(self, *args, **kwargs):
        # 'meta' is ours, everything else is forwarded to the next initializer
        meta = kwargs.pop('meta', None)
        super(BaseSeries, self).__init__(*args, **kwargs)
        needed = ("name", "_sname", "_iname")
        if all(hasattr(self, attr) for attr in needed):
            wanted_name = self._sname
            if wanted_name is not None and self.name != wanted_name:
                # Only warn when an explicit, different name gets replaced
                if self.name is not None:
                    warnings.warn("Object's name changed")
                self.name = wanted_name
            wanted_index = self._iname
            if wanted_index is not None and self.index.name != wanted_index:
                if self.index.name is not None:
                    warnings.warn("Object's index name changed")
                self.index.name = wanted_index
        self.meta = meta


class BaseDataFrame(Numerical):
    """
    Shared behavior of dense and sparse dataframe objects (labeled matrices).

    Note:
        If the _cardinal attribute is populated, it will automatically be added
        to the _categories and _columns attributes.

    Attributes:
        _cardinal (tuple): Tuple of column name and raw type that acts as foreign key to index of another table
        _index (str): Name of index (may be used as foreign key in another table)
        _columns (list): Required columns
        _categories (dict): Dict of column names, raw types that if present will be converted to and from categoricals automatically
    """
    _metadata = ['name', 'meta']
    _cardinal = None     # (column name, raw type) acting as foreign key to the index of another table
    _index = None      # index name (may be used as foreign key in another table)
    _columns = []      # required columns
    _categories = {}   # column name -> raw type, converted to and from categoricals automatically

    def __init__(self, *args, **kwargs):
        # 'meta' is ours, everything else is forwarded to the next initializer
        meta = kwargs.pop('meta', None)
        super(BaseDataFrame, self).__init__(*args, **kwargs)
        self.meta = meta

    def cardinal_groupby(self):
        """
        Group this object on its cardinal dimension (_cardinal).

        The cardinal column is cast to its raw type for grouping and cast
        back to a categorical afterwards.

        Returns:
            grpby: Pandas groupby object (grouped on _cardinal)
        """
        column, raw_type = self._cardinal
        self[column] = self[column].astype(raw_type)
        grouped = self.groupby(column)
        self[column] = self[column].astype('category')
        return grouped

    def slice_cardinal(self, key):
        """
        Select the rows whose cardinal column holds the given value or values.

        Args:
            key: Value or values of the cardinal dimension

        Returns:
            data: New object of the same class holding the matching rows
        """
        wanted = check_key(self, key, cardinal=True)
        column = self._cardinal[0]
        mask = self[column].isin(wanted)
        return self.__class__(self[mask])


class Series(BaseSeries, pd.Series):
    """
    A labeled array.

    .. code-block:: Python

        class MySeries(exatomic.exa.core.numerical.Series):
            _sname = 'data'        # series default name
            _iname = 'data_index'  # series default index name

        seri = MySeries(np.random.rand(10**5))
    """
    @property
    def _constructor(self):
        # Class pandas uses when an operation produces a new series
        return Series

    def copy(self, *args, **kwargs):
        """
        Return a copy of this object, keeping its class.

        See Also:
            For arguments and description of behavior see `pandas docs`_.

        .. _pandas docs: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.copy.html
        """
        # Wrapping in the class does not copy by itself, so copy the plain
        # pandas series first and re-wrap the result.
        duplicate = pd.Series(self).copy(*args, **kwargs)
        return self.__class__(duplicate)


class DataFrame(BaseDataFrame, pd.DataFrame):
    """
    A data table

    On construction the cardinal column (if any) is registered as a required,
    categorical column, known category columns are converted to categoricals,
    and (for non-empty tables) required columns and the index name are
    enforced.

    .. code-block:: Python

        class MyDF(exatomic.exa.core.numerical.DataFrame):
            _cardinal = ('cardinal', int)
            _index = 'mydf_index'
            _columns = ['x', 'y', 'z', 'symbol']
            _categories = {'symbol': str}
    """
    _constructor_sliced = Series

    @property
    def _constructor(self):
        return DataFrame

    def copy(self, *args, **kwargs):
        """
        Make a copy of this object.

        See Also:
            For arguments and description of behavior see `pandas docs`_.

        .. _pandas docs: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.copy.html
        """
        cls = self.__class__    # Note that type conversion does not perform copy
        return cls(pd.DataFrame(self).copy(*args, **kwargs))

    def _revert_categories(self, inplace=True):
        """
        Convert category columns back to their raw types (see _categories).

        Args:
            inplace (bool): Modify this object (default) or a copy of it

        Returns:
            copy: The converted copy if ``inplace`` is False, otherwise None
        """
        target = self if inplace else self.copy()
        for column, dtype in target._categories.items():
            if column in target.columns:
                target[column] = target[column].astype(dtype)
        return None if inplace else target

    def _set_categories(self):
        """
        Inplace conversion of the columns named in _categories to categoricals.
        """
        for column in self._categories:
            if column in self.columns:
                self[column] = self[column].astype('category')

    def _register_cardinal(self):
        """
        Record the cardinal column as a required, categorical column.

        The containers are rebound on the concrete class instead of being
        mutated in place: mutating would change the lists/dicts inherited from
        parent classes (and so every other subclass sharing them) and would
        append a duplicate entry on every instantiation.
        """
        cls = type(self)
        column, raw_type = self._cardinal[0], self._cardinal[1]
        known = cls._categories
        if column not in known or known[column] is not raw_type:
            categories = dict(known)
            categories[column] = raw_type
            cls._categories = categories
        if column not in cls._columns:
            cls._columns = list(cls._columns) + [column]

    def __init__(self, *args, **kwargs):
        super(DataFrame, self).__init__(*args, **kwargs)
        self.log.debug('shape: {}'.format(self.shape))
        if self._cardinal is not None:
            self._register_cardinal()
        self._set_categories()
        # Requirements are only enforced on tables that actually hold data
        if len(self) > 0:
            name = self.__class__.__name__
            if self._columns:
                missing = set(self._columns).difference(self.columns)
                if missing:
                    raise RequiredColumnError(missing, name)
            if self._index is not None and self.index.name != self._index:
                current = self.index.name
                # Index names may arrive as bytes; text names have no
                # .decode on Python 3, so only decode real bytes.
                if isinstance(current, bytes):
                    current = current.decode('utf-8')
                if current is not None and current != self._index:
                    warnings.warn("Object's index name changed from {} to {}".format(self.index.name, self._index))
                self.index.name = self._index


class Field(DataFrame):
    """
    A field is defined by field data and field values. Field data defines the
    discretization of the field (i.e. its origin in a given space, number of
    steps/step spacing, and endpoint for example). Field values can be scalar
    (series) and/or vector (dataframe) data defining the magnitude and/or direction
    at each given point.

    Note:
        The convention for generating the discrete field data and ordering of
        the field values must be the same (e.g. discrete field points are
        generated x, y, then z and scalar field values are a series object
        ordered looping first over x then y, then z).

    In addition to the :class:`~exatomic.exa.core.numerical.DataFrame` attributes, this object
    has the following:

    Attributes:
        field_values (list): Field values, one :class:`~exatomic.exa.core.numerical.Series` per field, named by position
    """
    @property
    def _constructor(self):
        return Field

    def copy(self, *args, **kwargs):
        """
        Make a copy of this object.

        Note:
            Copies both field data and field values.

        See Also:
            For arguments and description of behavior see `pandas docs`_.

        .. _pandas docs: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.copy.html
        """
        cls = self.__class__    # Note that type conversion does not perform copy
        data = pd.DataFrame(self).copy(*args, **kwargs)
        values = [field.copy() for field in self.field_values]
        return cls(data, field_values=values)

    def memory_usage(self, index=True, deep=False):
        """
        Get the combined memory usage of the field data and field values.

        Args:
            index (bool): Include the memory used by the indexes (pandas default)
            deep (bool): Introspect object dtypes for their real consumption (pandas default)

        Returns:
            data: Per column usage of the field data plus a ``field_values`` entry

        Note:
            The pandas keywords are accepted because pandas itself passes them
            (for example from ``DataFrame.info``).
        """
        data = super(Field, self).memory_usage(index=index, deep=deep)
        data['field_values'] = sum(value.memory_usage(index=index, deep=deep)
                                   for value in self.field_values)
        return data

    def slice_naive(self, key):
        """
        Naively (on index) slice the field data and values.

        Args:
            key: Int, slice, or iterable to select data and values

        Returns:
            field: Sliced field object
        """
        cls = self.__class__
        key = check_key(self, key)
        if isinstance(key, tuple):
            # .loc would read a tuple as (row, column) rather than as row labels
            key = list(key)
        # Map the selected index labels to row positions; field_values is a
        # plain list, so it has to be indexed one position at a time.
        positions = pd.Series(range(len(self)), index=self.index).loc[key]
        values = [self.field_values[i] for i in np.atleast_1d(positions)]
        data = self.loc[key]
        return cls(data, field_values=values)

    # TODO: slice_cardinal is inherited unchanged and therefore returns a
    # field without any field values.

    def __init__(self, *args, **kwargs):
        # The following check allows creation of a single field (whose field data
        # comes from a series object and field values from another series object).
        field_values = kwargs.pop("field_values", None)
        if args and isinstance(args[0], pd.Series):
            args = (args[0].to_frame().T, )
        super(Field, self).__init__(*args, **kwargs)
        # NOTE(review): field_values is registered as metadata before it is
        # assigned, presumably so pandas stores it as a plain attribute; this
        # also replaces the inherited ['name', 'meta'] entries - confirm intent.
        self._metadata = ['field_values']
        if isinstance(field_values, (list, tuple, np.ndarray)):
            self.field_values = [Series(v) for v in field_values]    # Convert type for nice repr
        elif field_values is None:
            self.field_values = []
        elif isinstance(field_values, pd.Series):
            self.field_values = [Series(field_values)]
        else:
            raise TypeError("Wrong type for field_values with type {}".format(type(field_values)))
        # Each set of values is named after its position in the list
        for i, values in enumerate(self.field_values):
            values.name = i
        self.log.info('contains {} fields'.format(len(self.field_values)))


class Field3D(Field):
    """
    Field data table describing a scalar or vector field on a 3D grid.

    +-------------------+----------+-------------------------------------------+
    | Column            | Type     | Description                               |
    +===================+==========+===========================================+
    | nx                | int      | number of grid points in x                |
    +-------------------+----------+-------------------------------------------+
    | ny                | int      | number of grid points in y                |
    +-------------------+----------+-------------------------------------------+
    | nz                | int      | number of grid points in z                |
    +-------------------+----------+-------------------------------------------+
    | ox                | float    | field origin point in x                   |
    +-------------------+----------+-------------------------------------------+
    | oy                | float    | field origin point in y                   |
    +-------------------+----------+-------------------------------------------+
    | oz                | float    | field origin point in z                   |
    +-------------------+----------+-------------------------------------------+
    | xi                | float    | First component in x                      |
    +-------------------+----------+-------------------------------------------+
    | xj                | float    | Second component in x                     |
    +-------------------+----------+-------------------------------------------+
    | xk                | float    | Third component in x                      |
    +-------------------+----------+-------------------------------------------+
    | yi                | float    | First component in y                      |
    +-------------------+----------+-------------------------------------------+
    | yj                | float    | Second component in y                     |
    +-------------------+----------+-------------------------------------------+
    | yk                | float    | Third component in y                      |
    +-------------------+----------+-------------------------------------------+
    | zi                | float    | First component in z                      |
    +-------------------+----------+-------------------------------------------+
    | zj                | float    | Second component in z                     |
    +-------------------+----------+-------------------------------------------+
    | zk                | float    | Third component in z                      |
    +-------------------+----------+-------------------------------------------+

    Note:
        Each field should be flattened into an N x 1 (scalar) or N x 3 (vector)
        series or dataframe respectively. The flattening loops over x in the
        outermost loop and over z in the innermost loop (for both cases), i.e.
        C-style (row-major) order: the last index changes the fastest and the
        first index the slowest.

    See Also:
        :class:`~exatomic.exa.core.numerical.Field`
    """
    # Required columns: grid point counts, origin, then the three components
    # listed for each of x, y and z
    _columns = (['nx', 'ny', 'nz'] +
                ['ox', 'oy', 'oz'] +
                ['xi', 'xj', 'xk'] +
                ['yi', 'yj', 'yk'] +
                ['zi', 'zj', 'zk'])

    @property
    def _constructor(self):
        # Class pandas uses when an operation produces a new table
        return Field3D


def check_key(data_object, key, cardinal=False):
    """
    Update the value of an index key by matching values or getting positionals.

    Keys are matched against the index values of ``data_object`` or, when
    ``cardinal`` is set and the object defines ``_cardinal``, against the
    unique values of its cardinal column.

    Resolution rules:
        - integer: used as a value if it is non-negative and present in the
          index, otherwise as a position (negative integers count from the
          end); with ``cardinal`` it is always a value
        - slice: positional, returns the sorted selected values
        - tuple, list, index: returned unchanged if every entry is a known
          value (always, with ``cardinal``), otherwise all entries are read
          as positions and the sorted selected values are returned
        - array: returned unchanged

    Args:
        data_object: Object with an ``index`` (and ``_cardinal`` if ``cardinal`` is used)
        key: Integer, slice, tuple, list, array or index
        cardinal (bool): Resolve against the cardinal column instead of the index

    Returns:
        key: List of values for scalar and positional keys, otherwise the key as given

    Raises:
        KeyError: Unsupported key type, or a key that is neither a known
            value nor a valid position
    """
    # np.integer covers every numpy integer width, not only int32/int64
    itype = (int, np.integer)
    if not isinstance(key, itype + (slice, tuple, list, np.ndarray, pd.Index)):
        raise KeyError("Unknown key type {} for key {}".format(type(key), key))
    use_cardinal = cardinal and data_object._cardinal is not None
    if use_cardinal:
        keys = data_object[data_object._cardinal[0]].unique()
    else:
        keys = data_object.index.values
    try:
        if isinstance(key, itype):
            # Always hand back a list so callers can use .loc/.isin uniformly
            if use_cardinal or (key >= 0 and key in keys):
                return [key]
            return [keys[key]]
        if isinstance(key, slice):
            return sorted(keys[key])
        if (not use_cardinal and isinstance(key, (tuple, list, pd.Index))
                and not all(k in keys for k in key)):
            # The builtin all() is required here: np.all() on a generator is
            # always truthy. list() keeps a tuple from being read as a
            # multi-dimensional index.
            return sorted(keys[list(key)])
    except IndexError:
        # Keep the exception type callers of the slicing methods see for bad keys
        raise KeyError("Key {} matches neither index values nor valid positions".format(key))
    return key


class SparseDataFrame(BaseDataFrame):
    """
    Sparse counterpart of the dataframe objects, built on
    :class:`~exatomic.exa.core.numerical.BaseDataFrame`.
    """
    @property
    def _constructor(self):
        # Class used when an operation produces a new object of this kind
        return SparseDataFrame