Source code for exatomic.exa.core.numerical

# -*- coding: utf-8 -*-
# Copyright (c) 2015-2022, Exa Analytics Development Team
# Distributed under the terms of the Apache License 2.0
"""
Data Objects
###################################
Data objects are used to store typed data coming from an external source (for
example a file on disk). There are three primary data objects provided by
this module, :class:`~exatomic.exa.core.numerical.Series`, :class:`~exatomic.exa.core.numerical.DataFrame`,
and :class:`~exatomic.exa.core.numerical.Field`. The purpose of these objects is to facilitate
conversion of data into "traits" used in visualization and enforce relationships
between data objects in a given container. Any of the objects provided by this
module may be extended.
"""
import logging
import warnings
import numpy as np
import pandas as pd
from exatomic.exa.core.error import RequiredColumnError


class Numerical(object):
    """
    Shared base for the :class:`~exatomic.exa.core.numerical.Series`,
    :class:`~exatomic.exa.core.numerical.DataFrame`, and
    :class:`~exatomic.exa.core.numerical.Field` data objects.

    It contributes a per-class logger, index based slicing, and a string
    form that mirrors the object's ``repr``.
    """
    @property
    def log(self):
        """Logger named ``<module>.<class name>`` for the concrete object."""
        qualified = '.'.join((self.__module__, self.__class__.__name__))
        return logging.getLogger(qualified)

    def slice_naive(self, key):
        """
        Select part of this object through its index.

        The key is first normalised by :func:`check_key` and the result is
        then used for a label (``.loc``) lookup.

        Args:
            key: Single index value, slice, tuple, or list of indices/positionals

        Returns:
            data: Slice of self, wrapped in the same class as self
        """
        selector = check_key(self, key)
        return self.__class__(self.loc[selector])

    def __str__(self):
        # The printable form is deliberately identical to the repr.
        return self.__repr__()
class BaseSeries(Numerical):
    """
    Foundation for dense and sparse series objects (labeled arrays).

    Subclasses may pin a required series name and/or index name; both are
    enforced when an instance is created.

    Attributes:
        _sname (str): May have a required name (default None)
        _iname (str): May have a required index name
        _stype (type): May have a required value type
        _itype (type): May have a required index type
    """
    _metadata = ['name', 'meta']
    # Hooks that subclasses may override.
    _sname = None    # required series name, if any
    _iname = None    # required index name, if any
    _stype = None    # required value type, if any
    _itype = None    # required index type, if any

    def __init__(self, *args, **kwargs):
        """
        Build the series and enforce the required name / index name.

        Args:
            meta: Optional keyword-only metadata stored on ``self.meta``;
                every other argument is forwarded to the parent constructor.
        """
        meta = kwargs.pop('meta', None)
        super(BaseSeries, self).__init__(*args, **kwargs)
        has_naming_hooks = all(
            hasattr(self, attr) for attr in ("name", "_sname", "_iname")
        )
        if has_naming_hooks:
            wanted_name = self._sname
            if wanted_name is not None and self.name != wanted_name:
                # Only warn when an explicit, different name is overwritten.
                if self.name is not None:
                    warnings.warn("Object's name changed")
                self.name = wanted_name
            wanted_index = self._iname
            if wanted_index is not None and self.index.name != wanted_index:
                if self.index.name is not None:
                    warnings.warn("Object's index name changed")
                self.index.name = wanted_index
        self.meta = meta
class BaseDataFrame(Numerical):
    """
    Base class for dense and sparse dataframe objects (labeled matrices).

    Note:
        If the _cardinal attribute is populated, it will automatically be added
        to the _categories and _columns attributes.

    Attributes:
        _cardinal (tuple): Tuple of column name and raw type that acts as foreign key to index of another table
        _index (str): Name of index (may be used as foreign key in another table)
        _columns (list): Required columns
        _categories (dict): Dict of column names, raw types that if present will be converted to and from categoricals automatically
    """
    _metadata = ['name', 'meta']
    _cardinal = None    # Tuple of column name and raw type that acts as foreign key to index of another table
    _index = None       # Name of index (may be used as foreign key in another table)
    _columns = []       # Required columns
    _categories = {}    # Dict of column names, raw types that if present will be converted to and from categoricals automatically

    def cardinal_groupby(self):
        """
        Group this object on its cardinal dimension (_cardinal).

        The cardinal column is cast to its raw type for grouping and is
        converted to a categorical afterwards.

        Returns:
            grpby: Pandas groupby object (grouped on _cardinal)
        """
        g, t = self._cardinal
        self[g] = self[g].astype(t)
        grpby = self.groupby(g)
        self[g] = self[g].astype('category')
        return grpby

    def slice_cardinal(self, key):
        """
        Get the slice of this object by the value or values of the cardinal
        dimension.

        Args:
            key: Single cardinal value or a list-like of cardinal values

        Returns:
            data: Rows whose cardinal column matches ``key``, wrapped in the
                same class as self
        """
        cls = self.__class__
        key = check_key(self, key, cardinal=True)
        # ``Series.isin`` only accepts list-likes; a single value would raise
        # TypeError, so wrap scalars before building the mask.
        if np.isscalar(key):
            key = [key]
        return cls(self[self[self._cardinal[0]].isin(key)])

    def __init__(self, *args, **kwargs):
        """
        Build the frame, storing the optional ``meta`` keyword on the object.

        All other arguments are forwarded to the parent constructor.
        """
        meta = kwargs.pop('meta', None)
        super(BaseDataFrame, self).__init__(*args, **kwargs)
        self.meta = meta
class Series(BaseSeries, pd.Series):
    """
    A labeled array.

    .. code-block:: Python

        class MySeries(exatomic.exa.core.numerical.Series):
            _sname = 'data'        # series default name
            _iname = 'data_index'  # series default index name

        seri = MySeries(np.random.rand(10**5))
    """
    @property
    def _constructor(self):
        # Results of pandas operations are wrapped in the base ``Series``.
        return Series

    def copy(self, *args, **kwargs):
        """
        Make a copy of this object.

        The copy keeps the concrete class of ``self`` and carries over the
        ``meta`` attribute (by reference).

        See Also:
            For arguments and description of behavior see `pandas docs`_.

        .. _pandas docs: https://pandas.pydata.org/docs/reference/api/pandas.Series.copy.html
        """
        cls = self.__class__
        # Note that type conversion does not perform copy
        duplicate = cls(pd.Series(self).copy(*args, **kwargs))
        # ``meta`` is declared in ``_metadata`` but is not carried through
        # the plain ``pd.Series`` round trip, so restore it explicitly.
        duplicate.meta = getattr(self, 'meta', None)
        return duplicate
class DataFrame(BaseDataFrame, pd.DataFrame):
    """
    A data table

    .. code-block:: Python

        class MyDF(exatomic.exa.core.numerical.DataFrame):
            _cardinal = ('cardinal', int)
            _index = 'mydf_index'
            _columns = ['x', 'y', 'z', 'symbol']
            _categories = {'symbol': str}
    """
    _constructor_sliced = Series

    @property
    def _constructor(self):
        # Results of pandas operations are wrapped in the base ``DataFrame``.
        return DataFrame

    def copy(self, *args, **kwargs):
        """
        Make a copy of this object.

        The copy keeps the concrete class of ``self`` and carries over the
        ``meta`` attribute (by reference).

        See Also:
            For arguments and description of behavior see `pandas docs`_.

        .. _pandas docs: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.copy.html
        """
        cls = self.__class__
        # Note that type conversion does not perform copy
        duplicate = cls(pd.DataFrame(self).copy(*args, **kwargs))
        # ``meta`` is not carried through the plain ``pd.DataFrame`` round
        # trip, so restore it explicitly.
        duplicate.meta = getattr(self, 'meta', None)
        return duplicate

    def _revert_categories(self, inplace=True):
        """
        Convert the columns listed in ``_categories`` back to their raw types.

        Args:
            inplace (bool): Modify this object (default) or work on a copy

        Returns:
            None when ``inplace`` is true, otherwise the converted copy
        """
        target = self if inplace else self.copy()
        for column, dtype in target._categories.items():
            if column in target.columns:
                target[column] = target[column].astype(dtype)
        return None if inplace else target

    def _set_categories(self):
        """
        Inplace conversion of the columns listed in ``_categories`` to
        categoricals.
        """
        for column in self._categories:
            if column in self.columns:
                self[column] = self[column].astype('category')

    def __init__(self, *args, **kwargs):
        """
        Build the table and enforce the class level schema.

        The cardinal column (if any) is registered as a required, categorical
        column; category columns are converted; and, for non-empty tables,
        required columns and the index name are checked.

        Raises:
            RequiredColumnError: If a non-empty table lacks required columns
        """
        super(DataFrame, self).__init__(*args, **kwargs)
        self.log.debug('shape: {}'.format(self.shape))
        if self._cardinal is not None:
            cls = type(self)
            column, dtype = self._cardinal[0], self._cardinal[1]
            # Rebind fresh containers on the concrete class instead of
            # mutating the inherited ones: the class level list/dict are
            # shared, so in-place edits would leak into other classes and
            # grow ``_columns`` by one entry per instantiation.
            if cls._categories.get(column) is not dtype:
                categories = dict(cls._categories)
                categories[column] = dtype
                cls._categories = categories
            if column not in cls._columns:
                cls._columns = list(cls._columns) + [column]
        self._set_categories()
        if len(self) > 0:
            name = self.__class__.__name__
            if self._columns:
                missing = set(self._columns).difference(self.columns)
                if missing:
                    raise RequiredColumnError(missing, name)
            if self._index is not None and self.index.name != self._index:
                current = self.index.name
                # Index names may arrive as bytes; ``str`` has no ``decode``
                # on Python 3, so only decode actual bytes.
                if isinstance(current, bytes):
                    current = current.decode('utf-8')
                if current is not None and current != self._index:
                    warnings.warn("Object's index name changed from {} to {}".format(self.index.name, self._index))
                self.index.name = self._index
class Field(DataFrame):
    """
    A field is defined by field data and field values. Field data defines the
    discretization of the field (i.e. its origin in a given space, number of
    steps/step spacing, and endpoint for example). Field values can be scalar
    (series) and/or vector (dataframe) data defining the magnitude and/or
    direction at each given point.

    Note:
        The convention for generating the discrete field data and ordering of
        the field values must be the same (e.g. discrete field points are
        generated x, y, then z and scalar field values are a series object
        ordered looping first over x then y, then z).

    In addition to the :class:`~exatomic.exa.core.numerical.DataFrame`
    attributes, this object has the following:

    Attributes:
        field_values (list): Field values, stored as a list of
            :class:`~exatomic.exa.core.numerical.Series` named by position
    """
    @property
    def _constructor(self):
        return Field

    def copy(self, *args, **kwargs):
        """
        Make a copy of this object.

        Note:
            Copies both field data and field values.

        See Also:
            For arguments and description of behavior see `pandas docs`_.

        .. _pandas docs: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.copy.html
        """
        cls = self.__class__
        # Note that type conversion does not perform copy
        data = pd.DataFrame(self).copy(*args, **kwargs)
        values = [field.copy() for field in self.field_values]
        duplicate = cls(data, field_values=values)
        # Keep the ``meta`` attribute, which the round trip would reset.
        duplicate.meta = getattr(self, 'meta', None)
        return duplicate

    def memory_usage(self, index=True, deep=False):
        """
        Get the combined memory usage of the field data and field values.

        Args:
            index (bool): Include the index in the totals (pandas default)
            deep (bool): Introspect object dtypes (pandas default)

        Returns:
            data: Per column usage of the field data with an additional
                ``field_values`` entry holding the summed usage of the values
        """
        # Accept and forward the pandas parameters so that callers passing
        # ``index``/``deep`` (as pandas' own ``DataFrame.info`` does) work.
        data = super(Field, self).memory_usage(index=index, deep=deep)
        data['field_values'] = sum(
            value.memory_usage(index=index, deep=deep)
            for value in self.field_values
        )
        return data

    def slice_naive(self, key):
        """
        Naively (on index) slice the field data and values.

        Args:
            key: Int, slice, or iterable to select data and values

        Returns:
            field: Sliced field object
        """
        cls = self.__class__
        key = check_key(self, key)
        # Map index labels to row positions so the matching entries of the
        # ``field_values`` list can be picked.
        positions = pd.Series(range(len(self)), index=self.index)
        selected = np.atleast_1d(positions.loc[key])
        # ``field_values`` is a plain list and cannot be indexed with an
        # array, so select element by element.  A field without values
        # simply yields no values.
        if self.field_values:
            values = [self.field_values[i] for i in selected]
        else:
            values = []
        data = self.loc[key]
        return cls(data, field_values=values)

    #def slice_cardinal(self, key):
    #    cls = self.__class__
    #    grpby = self.cardinal_groupby()

    def __init__(self, *args, **kwargs):
        """
        Build the field data table and attach the field values.

        Args:
            field_values: None, a single pandas series, or a list/tuple/array
                of array-likes; each entry is stored as a
                :class:`~exatomic.exa.core.numerical.Series`

        Raises:
            TypeError: If ``field_values`` has an unsupported type
        """
        # The following check allows creation of a single field (whose field data
        # comes from a series object and field values from another series object).
        field_values = kwargs.pop("field_values", None)
        if args and isinstance(args[0], pd.Series):
            args = (args[0].to_frame().T, )
        super(Field, self).__init__(*args, **kwargs)
        # Declared before assignment so that ``field_values`` is stored as a
        # regular attribute of the frame.
        self._metadata = ['field_values']
        if field_values is None:
            self.field_values = []
        elif isinstance(field_values, pd.Series):
            self.field_values = [Series(field_values)]
        elif isinstance(field_values, (list, tuple, np.ndarray)):
            self.field_values = [Series(v) for v in field_values]    # Convert type for nice repr
        else:
            raise TypeError("Wrong type for field_values with type {}".format(type(field_values)))
        # Name every value series after its position in the list.
        for i, values in enumerate(self.field_values):
            values.name = i
        self.log.info('contains {} fields'.format(len(self.field_values)))
class Field3D(Field):
    """
    Field data table describing a scalar or vector field on a 3D grid.

    +--------+-------+------------------------------+
    | Column | Type  | Description                  |
    +========+=======+==============================+
    | nx     | int   | number of grid points in x   |
    +--------+-------+------------------------------+
    | ny     | int   | number of grid points in y   |
    +--------+-------+------------------------------+
    | nz     | int   | number of grid points in z   |
    +--------+-------+------------------------------+
    | ox     | float | field origin point in x      |
    +--------+-------+------------------------------+
    | oy     | float | field origin point in y      |
    +--------+-------+------------------------------+
    | oz     | float | field origin point in z      |
    +--------+-------+------------------------------+
    | xi     | float | First component in x         |
    +--------+-------+------------------------------+
    | xj     | float | Second component in x        |
    +--------+-------+------------------------------+
    | xk     | float | Third component in x         |
    +--------+-------+------------------------------+
    | yi     | float | First component in y         |
    +--------+-------+------------------------------+
    | yj     | float | Second component in y        |
    +--------+-------+------------------------------+
    | yk     | float | Third component in y         |
    +--------+-------+------------------------------+
    | zi     | float | First component in z         |
    +--------+-------+------------------------------+
    | zj     | float | Second component in z        |
    +--------+-------+------------------------------+
    | zk     | float | Third component in z         |
    +--------+-------+------------------------------+

    Note:
        Every field is stored flattened: N x 1 (series) for a scalar field,
        N x 3 (dataframe) for a vector field. In both cases x is the outer
        loop and z the inner loop, i.e. row-major (C-style) order in which
        the last index varies fastest and the first index slowest.

    See Also:
        :class:`~exatomic.exa.core.numerical.Field`
    """
    # Required columns: grid sizes, origin, then the x, y and z components.
    _columns = [
        'nx', 'ny', 'nz',
        'ox', 'oy', 'oz',
        'xi', 'xj', 'xk',
        'yi', 'yj', 'yk',
        'zi', 'zj', 'zk',
    ]

    @property
    def _constructor(self):
        return Field3D
def check_key(data_object, key, cardinal=False):
    """
    Update the value of an index key by matching values or getting positionals.

    Args:
        data_object: Object exposing ``index.values`` and, for cardinal
            lookups, a ``_cardinal`` tuple plus column access
        key: Integer, slice, tuple, list, pandas index or numpy array
        cardinal (bool): Resolve the key against the unique values of the
            cardinal column instead of the index (only when the object
            defines ``_cardinal``)

    Returns:
        key: List of labels for integer and slice keys. Tuples, lists and
            pandas indexes are returned unchanged when every entry is an
            existing label and are otherwise treated as positions. Numpy
            arrays, and any sequence in cardinal mode, are returned unchanged.

    Raises:
        KeyError: If the key has an unsupported type
    """
    itype = (int, np.int32, np.int64)
    seqtype = (tuple, list, pd.Index)
    if not isinstance(key, itype + seqtype + (slice, np.ndarray)):
        raise KeyError("Unknown key type {} for key {}".format(type(key), key))
    if cardinal and data_object._cardinal is not None:
        keys = np.asarray(data_object[data_object._cardinal[0]].unique())
        if isinstance(key, itype):
            # Always hand back a list: callers feed the result to ``isin``,
            # which rejects scalars.  A negative integer that is not itself
            # a cardinal value counts from the end of the unique values.
            if key < 0 and key not in keys:
                key = keys[key]
            return [key]
        if isinstance(key, slice):
            return sorted(keys[key])
        # Sequences are matched by value by the caller; leave them untouched.
        return key
    keys = data_object.index.values
    if isinstance(key, itype):
        if key in keys or key < 0:
            # NOTE(review): a key that matches an index value is still looked
            # up by position here; the two agree for a default range index.
            # Confirm the intent for other integer indexes.
            key = keys[key]
            if np.ndim(key) != 0:
                return sorted(key)
        return [key]
    if isinstance(key, slice):
        return sorted(keys[key])
    # The builtin ``all`` is required: ``np.all`` on a generator is always
    # truthy, which made this positional fallback unreachable.
    if isinstance(key, seqtype) and not all(k in keys for k in key):
        # ``list(key)`` keeps a tuple from being read as a multi-axis index.
        return sorted(keys[list(key)])
    return key
class SparseDataFrame(BaseDataFrame):
    """
    Dataframe class derived from :class:`BaseDataFrame`.

    NOTE(review): unlike ``DataFrame`` no pandas base class is mixed in
    here - confirm whether that is intended.
    """
    @property
    def _constructor(self):
        # Always the base ``SparseDataFrame`` class, also for subclasses.
        return SparseDataFrame