# Source code for exatomic.exa.core.container

# -*- coding: utf-8 -*-
# Copyright (c) 2015-2022, Exa Analytics Development Team
# Distributed under the terms of the Apache License 2.0
"""
Container
########################
The :class:`~exatomic.exa.core.container.Container` class is the primary object for
data processing, analysis, and visualization. In brief, containers are composed
of data objects whose contents are used for 2D and 3D visualization. Containers
also provide some content management and data relationship features.

See Also:
    For a description of data objects see :mod:`~exatomic.exa.core.numerical`.
"""
import os
import logging
from uuid import uuid4
from sys import getsizeof
from copy import deepcopy
from collections import defaultdict
import numpy as np
import pandas as pd
from pandas.core.dtypes.dtypes import CategoricalDtype
import networkx as nx
import matplotlib.pyplot as plt
from exatomic.exa.util.utility import convert_bytes
from exatomic.exa.util import mpl
from .numerical import check_key, Field, Series, DataFrame


class Container(object):
    """
    Container class responsible for all features related to data management.

    Pandas ``Series``/``DataFrame`` instances (and subclasses) attached as
    instance attributes are treated as data objects; every other public
    instance attribute (e.g. ``name``, ``description``, ``meta``, ``uuid``)
    is treated as descriptive metadata.
    """
    # Prefix of the convenience methods (e.g. ``compute_<name>``) that typed
    # property getters look up when a data object has not been set yet.
    _getter_prefix = 'compute'
    _cardinal = None    # Name of the cardinal data table
    @property
    def log(self):
        name = '.'.join([self.__module__,
                         self.__class__.__name__])
        return logging.getLogger(name)

[docs]    def copy(self, name=None, description=None, meta=None):
        """
        Create a copy of the current object (may alter the container's name,
        description, and update the metadata if needed).
        """
        cls = self.__class__
        kwargs = self._rel(copy=True)
        kwargs.update(self._data(copy=True))
        if name is not None:
            kwargs['name'] = name
        if description is not None:
            kwargs['description'] = description
        if meta is not None:
            kwargs['meta'] = meta
        return cls(**kwargs)

[docs]    def concat(self, *args, **kwargs):
        """
        Concatenate any number of container objects with the current object into
        a single container object.

        See Also:
            For argument description, see :func:`~exatomic.exa.core.container.concat`.
        """
        raise NotImplementedError()

[docs]    def slice_naive(self, key):
        """
        Naively slice each data object in the container by the object's index.

        Args:
            key: Int, slice, or list by which to extra "sub"-container

        Returns:
            sub: Sub container of the same format with a view of the data

        Warning:
            To ensure that a new container is created, use the copy method.

            .. code-block:: Python

                mycontainer[slice].copy()
        """
        kwargs = {'name': self.name, 'description': self.description, 'meta': self.meta}
        for name, data in self._data().items():
            k = name[1:] if name.startswith('_') else name
            kwargs[k] = data.slice_naive(key)
        return self.__class__(**kwargs)

[docs]    def slice_cardinal(self, key):
        """
        Slice the container according to its (primary) cardinal axis.

        The "cardinal" axis can have any name so long as the name matches a
        data object attached to the container. The index name for this object
        should also match the value of the cardinal axis.

        The algorithm builds a network graph representing the data relationships
        (including information about the type of relationship) and then traverses
        the edge tree (starting from the cardinal table). Each subsequent child
        object in the tree is sliced based on its relationship with its parent.

        Note:
            Breadth first traversal is performed.

        Warning:
            This function does not make a copy (if possible): to ensure a new
            object is created (a copy) use :func:`~exatomic.exa.core.container.Container.copy`
            after slicing.

            .. code-block:: Python

                myslice = mycontainer[::2].copy()

        See Also:
            For data network generation, see :func:`~exatomic.exa.core.container.Container.network`.
            For information about relationships between data objects see
            :mod:`~exatomic.exa.core.numerical`.
        """
        if self._cardinal:
            cls = self.__class__
            key = check_key(self[self._cardinal], key, cardinal=True)
            g = self.network(fig=False)
            kwargs = {self._cardinal: self[self._cardinal].loc[key], 'name': self.name,
                      'description': self.description, 'meta': self.meta}
            # Next traverse, breadth first, all data objects
            for parent, child in nx.bfs_edges(g, self._cardinal):
                if child in kwargs:
                    continue
                typ = g.edge_types[(parent, child)]
                if self._cardinal in self[child].columns and hasattr(self[child], 'slice_cardinal'):
                    kwargs[child] = self[child].slice_cardinal(key)
                elif typ == 'index-index':
                    # Select from the child on the parent's index (the parent is
                    # in the kwargs already).
                    kwargs[child] = self[child].loc[kwargs[parent].index.values]
                elif typ == 'index-column':
                    # Select from the child where the column (of the same name as
                    # the parent) is in the parent's index values
                    cdf = self[child]
                    kwargs[child] = cdf[cdf[parent].isin(kwargs[parent].index.values)]
                elif typ == 'column-index':
                    # Select from the child where the child's index is in the
                    # column of the parent. Note that this relationship
                    cdf = self[child]
                    cin = cdf.index.name
                    cols = [col for col in kwargs[parent] if cin == col or (cin == col[:-1] and col[-1].isdigit())]
                    index = kwargs[parent][cols].stack().astype(np.int64).values
                    kwargs[child] = cdf[cdf.index.isin(index)]
            return cls(**kwargs)

[docs]    def cardinal_groupby(self):
        """
        Create an instance of this class for every step in the cardinal dimension.
        """
        if self._cardinal:
            g = self.network(fig=False)
            cardinal_indexes = self[self._cardinal].index.values
            selfs = {}
            cls = self.__class__
            for cardinal_index in cardinal_indexes:
                kwargs = {self._cardinal: self[self._cardinal].loc[[cardinal_index]]}
                for parent, child in nx.bfs_edges(g):
                    if child in kwargs:
                        continue
                    typ = g.edge_types[(parent, child)]
                    if self._cardinal in self[child].columns and hasattr(self[child], 'slice_cardinal'):
                        kwargs[child] = self[child].slice_cardinal(key)
                    elif typ == 'index-index':
                        # Select from the child on the parent's index (the parent is
                        # in the kwargs already).
                        kwargs[child] = self[child].loc[kwargs[parent].index.values]
                    elif typ == 'index-column':
                        # Select from the child where the column (of the same name as
                        # the parent) is in the parent's index values
                        cdf = self[child]
                        kwargs[child] = cdf[cdf[parent].isin(kwargs[parent].index.values)]
                    elif typ == 'column-index':
                        # Select from the child where the child's index is in the
                        # column of the parent. Note that this relationship
                        cdf = self[child]
                        cin = cdf.index.name
                        cols = [col for col in kwargs[parent] if cin == col or (cin == col[:-1] and col[-1].isdigit())]
                        index = kwargs[parent][cols].stack().astype(np.int64).values
                        kwargs[child] = cdf[cdf.index.isin(index)]
                selfs[cardinal_index] = cls(**kwargs)
        return selfs

[docs]    def info(self):
        """
        Display information about the container's data objects (note that info
        on metadata and visualization objects is also provided).

        Note:
            Sizes are reported in bytes.
        """
        names = []
        types = []
        sizes = []
        names.append('WIDGET')
        types.append('-')
        s = 0
        sizes.append(s)
        names.append('METADATA')
        types.append('-')
        s = 0
        for obj in self._rel().values():
            s += getsizeof(obj)
        sizes.append(s)
        for name, obj in self._data().items():
            names.append(name[1:] if name.startswith('_') else name)
            types.append('.'.join((obj.__module__, obj.__class__.__name__)))
            if isinstance(obj, pd.Series):
                sizes.append(obj.memory_usage())
            else:
                sizes.append(obj.memory_usage().sum())
        inf = pd.DataFrame.from_dict({'object': names, 'type': types, 'size': sizes})
        inf.set_index('object', inplace=True)
        return inf.sort_index()

[docs]    def memory_usage(self, string=False):
        """
        Get the memory usage estimate of the container.

        Args:
            string (bool): Human readable string (default false)

        See Also:
            :func:`~exatomic.exa.core.container.Container.info`
        """
        if string:
            n = getsizeof(self)
            return ' '.join((str(s) for s in convert_bytes(n)))
        return self.info()['size']

[docs]    def network(self, figsize=(14, 9), fig=True):
        """
        Display information about the container's object relationships.

        Nodes correspond to data objects. The size of the node corresponds
        to the size of the table in memory. The color of the node corresponds
        to its fundamental data type. Nodes are labeled by their container
        name; class information is listed below. The color of the connections
        correspond to the type of relationship; either an index of one table
        corresponds to a column in another table or the two tables share an
        index.

        Args:
            figsize (tuple): Tuple containing figure dimensions
            fig (bool): Generate the figure (default true)

        Returns:
            graph: Network graph object containing data relationships
        """
        conn_types = ['index-index', 'index-column']
        conn_colors = mpl.sns.color_palette('viridis', len(conn_types))
        conn = dict(zip(conn_types, conn_colors))

        def get_node_type_color(obj):
            """Gets the color of a node based on the node's (sub)type."""
            cols = mpl.sns.color_palette('viridis', len(conn_types))
            for col in cols:
                if isinstance(obj, (pd.DataFrame, pd.Series)):
                    typ = type(obj)
                    return '.'.join((typ.__module__, typ.__name__)), col
            return 'other', 'gray'

        def legend(items, name, loc, ax):
            """Legend creation helper function."""
            proxies = []
            descriptions = []
            for label, color in items:
                if label == 'column-index':
                    continue
                if name == 'Data Type':
                    line = mpl.sns.mpl.lines.Line2D([], [], linestyle='none', color=color, marker='o')
                else:
                    line = mpl.sns.mpl.lines.Line2D([], [], linestyle='-', color=color)
                proxies.append(line)
                descriptions.append(label)
            lgnd = ax.legend(proxies, descriptions, title=name, loc=loc, frameon=True)
            lgnd_frame = lgnd.get_frame()
            lgnd_frame.set_facecolor('white')
            lgnd_frame.set_edgecolor('black')
            return lgnd, ax

        info = self.info()
        info = info[info['type'] != '-']
        #info['size'] *= 13000/info['size'].max()
        info['size'] += 2000
        node_size_dict = info['size'].to_dict()      # Can pull all nodes from keys
        node_class_name_dict = info['type'].to_dict()
        node_type_dict = {}    # Values are tuple of "underlying" type and color
        node_conn_dict = {}    # Values are tuple of connection type and color
        items = self._data().items()
        for k0, v0 in items:
            n0 = k0[1:] if k0.startswith('_') else k0
            node_type_dict[n0] = get_node_type_color(v0)
            for k1, v1 in items:
                if v0 is v1:
                    continue
                n1 = k1[1:] if k1.startswith('_') else k1
                for name in v0.index.names:    # Check the index of data object 0 against the index
                    if name is None:           # and columns of data object 1
                        continue
                    if name in v1.index.names:
                        contyp = 'index-index'
                        node_conn_dict[(n0, n1)] = (contyp, conn[contyp])
                        node_conn_dict[(n1, n0)] = (contyp, conn[contyp])
                    if hasattr(v1, "columns"):
                        for col in v1.columns:
                            # Catches index "atom", column "atom1"; does not catch atom10
                            if name == col or (name == col[:-1] and col[-1].isdigit()):
                                contyp = 'index-column'
                                node_conn_dict[(n0, n1)] = (contyp, conn[contyp])
                                node_conn_dict[(n1, n0)] = ('column-index', conn[contyp])
        g = nx.Graph()
        g.add_nodes_from(node_size_dict.keys())
        g.add_edges_from(node_conn_dict.keys())
        node_sizes = [node_size_dict[node] for node in g.nodes()]
        node_labels = {node: '{}\n({}'.format(node, node_class_name_dict[node]) for node in g.nodes()}
        node_colors = [node_type_dict[node][1] for node in g.nodes()]
        edge_colors = [node_conn_dict[edge][1] for edge in g.edges()]
        # Build the figure and legends
        if fig:
            fig, ax = plt.subplots(1, figsize=figsize)
            ax.axis('off')
            pos = nx.spring_layout(g)
            nx.draw_networkx_nodes(g, pos=pos, ax=ax, alpha=0.7, node_size=node_sizes,
                                   node_color=node_colors)
            nx.draw_networkx_labels(g, pos=pos, labels=node_labels, font_size=12, ax=ax)
            nx.draw_networkx_edges(g, pos=pos, edge_color=edge_colors, width=2, ax=ax)
            l1, ax = legend(set(node_conn_dict.values()), 'Connection', (1, 0), ax)
            _, ax = legend(set(node_type_dict.values()), 'Data Type', (1, 0.3), ax)
            fig.gca().add_artist(l1)
        g.edge_types = {node: value[0] for node, value in node_conn_dict.items()}  # Attached connection information to network graph
        return g

[docs]    def save(self, path=None, complevel=1, complib='zlib'):
        """
        Save the container as an HDF5 archive.

        Args:
            path (str): Path where to save the container

        Returns:
            savepath (str): Path where the container was saved
        """
        if path is None:
            path = self.uuid + '.hdf5'
        elif os.path.isdir(path):
            path += os.sep + self.uuid + '.hdf5'
        elif not (path.endswith('.hdf5') or path.endswith('.hdf')):
            raise ValueError('File path must have a ".hdf5" or ".hdf" extension.')
        with pd.HDFStore(path, 'w', complevel=complevel, complib=complib) as store:
            store['kwargs'] = pd.Series()
            store.get_storer('kwargs').attrs.metadata = self._rel()
            fc = 0    # Field counter (see special handling of fields below)
            for name, data in self._data().items():
                if hasattr(data, '_revert_categories'):
                    data._revert_categories()
                name = name[1:] if name.startswith('_') else name
                if isinstance(data, Field):    # Fields are handled separately
                    fname = 'FIELD{}_'.format(fc) + name + '/'
                    store[fname + 'data'] = pd.DataFrame(data)
                    for i, field in enumerate(data.field_values):
                        ffname = fname + 'values' + str(i)
                        if isinstance(field, pd.Series):
                            store[ffname] = pd.Series(field)
                        else:
                            store[ffname] = pd.DataFrame(field)
                    fc += 1
                elif isinstance(data, Series):
                    s = pd.Series(data)
                    if isinstance(data.dtype, CategoricalDtype):
                        s = s.astype('O')
                    store[name] = s
                elif isinstance(data, DataFrame):
                    store[name] = pd.DataFrame(data)
                else:
                    if hasattr(data, 'dtype') and isinstance(data.dtype, CategoricalDtype):
                        data = data.astype('O')
                    else:
                        for col in data:
                            if isinstance(data[col].dtype, CategoricalDtype):
                                data[col] = data[col].astype('O')
                    store[name] = data
                if hasattr(data, '_set_categories'):
                    data._set_categories()
        return path

[docs]    def to_hdf(self, *args, **kwargs):
        """Alias of :func:`~exatomic.exa.core.container.Container`."""
        self.save(*args, **kwargs)

[docs]    @classmethod
    def load(cls, pkid_or_path=None):
        """
        Load a container object from a persistent location or file path.

        Args:
            pkid_or_path: Integer pkid corresponding to the container table or file path

        Returns:
            container: The saved container object
        """
        path = pkid_or_path
        if not os.path.isfile(path):
            raise FileNotFoundError('File {} not found.'.format(path))
        kwargs = {}
        fields = defaultdict(dict)
        with pd.HDFStore(path) as store:
            for key in store.keys():
                if 'kwargs' in key:
                    kwargs.update(store.get_storer(key).attrs.metadata)
                elif "FIELD" in key:
                    name, dname = "_".join(key.split("_")[1:]).split("/")
                    dname = dname.replace('values', '')
                    fields[name][dname] = store[key]
                else:
                    name = str(key[1:])
                    kwargs[name] = store[key]
        for name, field_data in fields.items():
            fps = field_data.pop('data')
            kwargs[name] = Field(fps, field_values=[field_data[str(arr)] for arr in
                                                    sorted(map(int, field_data.keys()))])
        return cls(**kwargs)

[docs]    @classmethod
    def from_hdf(cls, *args, **kwargs):
        """Alias for :func:`~exatomic.exa.core.container.Container`."""
        return cls.load(*args, **kwargs)

    def _rel(self, copy=False):
        """
        Get descriptive kwargs of the container (e.g. name, description, meta).
        """
        rel = {}
        for key, obj in vars(self).items():
            if not isinstance(obj, (pd.Series, pd.DataFrame)) and not key.startswith('_'):
                if copy and 'id' not in key:
                    rel[key] = deepcopy(obj)
                else:
                    rel[key] = obj
        return rel

    def _data(self, copy=False):
        """
        Get data kwargs of the container (i.e. dataframe and series objects).
        """
        data = {}
        for key, obj in vars(self).items():
            if isinstance(obj, (pd.Series, pd.DataFrame)):
                if copy:
                    data[key] = obj.copy(deep=True)
                else:
                    data[key] = obj
        return data

    def __delitem__(self, key):
        if key in vars(self):
            del self.__dict__[key]

    def __sizeof__(self):
        """Note that this function must return a Python integer."""
        return int(self.info()['size'].sum())

    def __getitem__(self, key):
        if isinstance(key, str):
            return getattr(self, key)
        elif isinstance(key, (int, slice, list)) and self._cardinal is None:
            return self.slice_naive(key)
        elif isinstance(key, (int, slice, list)) and self._cardinal is not None:
            return self.slice_cardinal(key)
        raise KeyError()

    def __init__(self, name=None, description=None, meta=None, uuid=None, **kwargs):
        self.log.info('adding {} attrs'.format(len(kwargs)))
        for key, value in kwargs.items():
            setattr(self, key, value)
        self.name = name
        self.description = description
        self.meta = meta
        self.uuid = uuid
        if uuid is None:
            self.uuid = str(uuid4())


class TypedMeta(type):
    """
    This metaclass creates statically typed class attributes using the property
    framework. Every public attribute defined directly on the metaclass whose
    value is a type becomes a property of the classes the metaclass creates.

    .. code-block:: Python

        class TestMeta(TypedMeta):
            attr1 = int
            attr2 = DataFrame

        class TestClass(metaclass=TestMeta):
            def __init__(self, attr1, attr2):
                self.attr1 = attr1
                self.attr2 = attr2

    The above code dynamically creates code that behaves like the following
    (in addition, reading an unset attribute first calls ``compute_attr1`` /
    ``compute_attr2`` if the instance provides such a method):

    .. code-block:: Python

        class TestClass:
            @property
            def attr1(self):
                return self._attr1

            @attr1.setter
            def attr1(self, obj):
                if not isinstance(obj, int):
                    obj = int(obj)    # TypeError if the conversion fails
                self._attr1 = obj

            @attr1.deleter
            def attr1(self):
                del self._attr1

            @property
            def attr2(self):
                return self._attr2

            @attr2.setter
            def attr2(self, obj):
                if not isinstance(obj, DataFrame):
                    obj = DataFrame(obj)    # TypeError if the conversion fails
                self._attr2 = obj

            @attr2.deleter
            def attr2(self):
                del self._attr2

            def __init__(self, attr1, attr2):
                self.attr1 = attr1
                self.attr2 = attr2
    """
    @staticmethod
    def create_property(name, ptype):
        """
        Creates a custom property with a getter that performs computing
        functionality (if available) and raise a type error if setting
        with the wrong type.

        Args:
            name (str): Public name of the property; the value is stored on
                the instance under ``'_' + name``
            ptype (type): Required type of the property's value

        Returns:
            prop (property): Property with getter, setter, and deleter

        Note:
            By default, the setter attempts to convert the object to the
            correct type; a type error is raised if this fails.
        """
        pname = '_' + name

        def getter(self):
            # This is the default property "getter" for container data objects.
            # The data is stored under the private name (e.g. self._name). If
            # it has not been set, this function checks for a convenience
            # method with the signature self.compute_name() and calls it prior
            # to returning the property value.
            if not hasattr(self, pname):
                prefix = getattr(self, '_getter_prefix', 'compute')
                # Plain attribute lookup, consistent with how the method's
                # existence is checked (no __getitem__ support required)
                compute = getattr(self, f'{prefix}{pname}', None)
                if compute is not None:
                    compute()
            if not hasattr(self, pname):
                raise AttributeError(f'Please compute or set {name} first.')
            return getattr(self, pname)

        def setter(self, obj):
            # This is the default property "setter" for container data objects.
            # Prior to setting a property value, this function checks that the
            # object's type is correct (converting it if possible).
            if not isinstance(obj, ptype):
                try:
                    obj = ptype(obj)
                except Exception as err:
                    # Keep the conversion failure as the explicit cause
                    raise TypeError('Must be able to convert object {0} to {1} (or must be of type {1})'.format(name, ptype)) from err
            setattr(self, pname, obj)

        def deleter(self):
            # Deletes the property's value (relies on the instance supporting
            # item deletion, i.e. defining __delitem__).
            del self[pname]

        return property(getter, setter, deleter)

    def __new__(mcs, name, bases, clsdict):
        """
        Modification of the class definition occurs here; we iterate over all
        statically typed attributes and attach their property (see
        :func:`~exatomic.exa.core.container.TypedMeta.create_property`) definition,
        returning the new class definition.
        """
        for k, v in vars(mcs).items():
            # Only public attributes whose value is a single type are converted
            if isinstance(v, type) and not k.startswith('_'):
                clsdict[k] = mcs.create_property(k, v)
        return super().__new__(mcs, name, bases, clsdict)