Source code for exatomic.core.molecule

# -*- coding: utf-8 -*-
# Copyright (c) 2015-2022, Exa Analytics Development Team
# Distributed under the terms of the Apache License 2.0
"""
Molecule Table
###################
"""
import numpy as np
import pandas as pd
import networkx as nx
import warnings
from networkx.algorithms.components import connected_components
from exatomic.exa import DataFrame
from exatomic.base import sym2mass
from exatomic.formula import string_to_dict, dict_to_string


[docs]class Molecule(DataFrame):
    """
    Description of molecules in the atomic universe.
    """
    _index = 'molecule'
    _categories = {'frame': np.int64, 'formula': str, 'classification': object}

    #@property
    #def _constructor(self):
    #    return Molecule

[docs]    def classify(self, *classifiers):
        """
        Classify molecules into arbitrary categories.

        .. code-block:: Python

            u.molecule.classify(('solute', 'Na'), ('solvent', 'H(2)O(1)'))

        Args:
            classifiers: Any number of tuples of the form ('label', 'identifier', exact) (see below)

        Note:
            A classifier has 3 parts, "label", e.g. "solvent", "identifier", e.g.
            "H(2)O(1)", and exact (true or false). If exact is false (default),
            classification is greedy and (in this example) molecules with formulas
            "H(1)O(1)", "H(3)O(1)", etc. would get classified as "solvent". If,
            instead, exact were set to true, those molecules would remain
            unclassified.

        Warning:
            Classifiers are applied in the order passed; where identifiers overlap,
            the latter classification is used.

        See Also:
            :func:`~exatomic.algorithms.nearest.compute_nearest_molecules`
        """
        for c in classifiers:
            n = len(c)
            if n != 3 and n != 2:
                raise ClassificationError()
        self['classification'] = None
        for classifier in classifiers:
            identifier = string_to_dict(classifier[0])
            classification = classifier[1]
            exact = classifier[2] if len(classifier) == 3 else False
            this = self
            for symbol, count in identifier.items():
                this = this[this[symbol] == count] if exact else this[this[symbol] >= 1]
            if len(this) > 0:
                self.ix[self.index.isin(this.index), 'classification'] = classification
            else:
                raise KeyError('No records found for {}, with identifier {}.'.format(classification, identifier))
        self['classification'] = self['classification'].astype('category')
        if len(self[self['classification'].isnull()]) > 0:
            warnings.warn("Unclassified molecules remaining...")

[docs]    def get_atom_count(self):
        """
        Compute the number of atoms per molecule.
        """
        symbols = self._get_symbols()
        return self[symbols].sum(axis=1)

[docs]    def get_formula(self, as_map=False):
        """
        Compute the string representation of the molecule.
        """
        symbols = self._get_symbols()
        mcules = self[symbols].to_dict(orient='index')
        ret = map(dict_to_string, mcules.values())
        if as_map:
            return ret
        return list(ret)

    def _get_symbols(self):
        """
        Helper method to get atom symbols.
        """
        return [col for col in self if len(col) < 3 and col[0].istitle()]


[docs]def compute_molecule(universe):
    """
    Cluster atoms into molecules and create the :class:`~exatomic.molecule.Molecule`
    table.

    Args:
        universe: Atomic universe

    Returns:
        molecule: Molecule table

    Warning:
        This function modifies the universe's atom (:class:`~exatomic.atom.Atom`)
        table in place!
    """
    nodes = universe.atom.index.values
    bonded = universe.atom_two.loc[universe.atom_two['bond'] == True, ['atom0', 'atom1']]
    edges = zip(bonded['atom0'].astype(np.int64), bonded['atom1'].astype(np.int64))
    g = nx.Graph()
    g.add_nodes_from(nodes)
    g.add_edges_from(edges)
    # generate molecule indices for the atom table
    mapper = {}
    i = 0
    for k, v in g.degree():    # First handle single atom "molecules"
        if v == 0:
            mapper[k] = i
            i += 1
    for seht in connected_components(g):    # Second handle multi atom molecules
        for adx in seht:
            mapper[adx] = i
        i += 1
    universe.atom['molecule'] = universe.atom.index.map(lambda x: mapper[x])
    universe.atom['mass'] = universe.atom['symbol'].map(sym2mass).astype(float)
    grps = universe.atom.groupby('molecule')
    molecule = grps['symbol'].value_counts().unstack().fillna(0).astype(np.int64)
    molecule.columns.name = None
    molecule['mass'] = grps['mass'].sum()
    universe.atom['molecule'] = universe.atom['molecule'].astype('category')
    del universe.atom['mass']
    return molecule


[docs]def compute_molecule_count(universe):
    """
    """
    if 'molecule' not in universe.atom.columns:
        universe.compute_molecule()
    universe.atom._revert_categories()
    mapper = universe.atom.drop_duplicates('molecule').set_index('molecule')['frame']
    universe.atom._set_categories()
    universe.molecule['frame'] = universe.molecule.index.map(lambda x: mapper[x])
    molecule_count = universe.molecule.groupby('frame').size()
    del universe.molecule['frame']
    return molecule_count


[docs]def compute_molecule_com(universe):
    """
    Compute molecules' centers of mass.
    """
    if 'molecule' not in universe.atom.columns:
        universe.compute_molecule()
    mass = universe.atom.get_element_masses()
    if universe.frame.is_periodic():
        xyz = universe.atom[['x', 'y', 'z']].copy()
        xyz.update(universe.visual_atom)
    else:
        xyz = universe.atom[['x', 'y', 'z']]
    xm = xyz['x'].mul(mass)
    ym = xyz['y'].mul(mass)
    zm = xyz['z'].mul(mass)
    #rm = xm.add(ym).add(zm)
    df = pd.DataFrame.from_dict({'xm': xm, 'ym': ym, 'zm': zm, 'mass': mass,
                                 'molecule': universe.atom['molecule']})
    groups = df.groupby('molecule')
    sums = groups.sum()
    cx = sums['xm'].div(sums['mass'])
    cy = sums['ym'].div(sums['mass'])
    cz = sums['zm'].div(sums['mass'])
    return cx, cy, cz