Source code for exatomic.algorithms.neighbors

# -*- coding: utf-8 -*-
# Copyright (c) 2015-2022, Exa Analytics Development Team
# Distributed under the terms of the Apache License 2.0
"""
Neighbor Selection Algorithms
###############################
This module provides algoirthms for selecting nearest neighbors, e.g. nearest
solvent molecules to a solute molecule. Because two body properties do not always
represent the desired molecules (i.e. bonds appear where they are not desired),
these algorithms are not completely black box.

Before performing a search, check that the molecule table is computed as desired
and classified (if necessary): see :func:`~exatomic.two.BaseTwo.compute_bonds`
and :func:`~exatomic.molecule.Molecule.classify`.
"""
import numpy as np
import pandas as pd
import numba as nb
from collections import defaultdict
from IPython.display import display
from ipywidgets import FloatProgress
from exatomic.base import nbpll
from exatomic.core.atom import Atom
from exatomic.core.universe import Universe


@nb.jit(nopython=True, parallel=nbpll)
def _worker(idx, x, y, z, a):
    """
    Generate a 3x3x3 'super' cell from a cubic unit cell.

    Args:
        idx (array): Array of index values
        x (array): Array of x coordinates
        y (array): Array of y coordinates
        z (array): Array of z coordinates
        a (float): Cubic unit cell dimension
    """
    n = len(x)
    idxs = np.empty((27*n, ), dtype=np.int64)
    prj = idxs.copy()
    px = np.empty((27*n, ), dtype=np.float64)
    py = px.copy()
    pz = px.copy()
    p = 0
    m = 0
    for i in [-1, 0, 1]:
        for j in [-1, 0, 1]:
            for k in [-1, 0, 1]:
                for l in range(n):
                    idxs[m] = idx[l]
                    px[m] = x[l] + i*a
                    py[m] = y[l] + j*a
                    pz[m] = z[l] + k*a
                    prj[m] = p
                    m += 1
                p += 1
    return idxs, px, py, pz, prj


def _create_super_universe(u, a):
    """
    Generate a 3x3x3 super cell from a cubic periodic universe

    Args:
        u (:class:`~exatomic.core.universe.Universe`): Universe
        a (float): Cubic unit cell dimension

    Returns:
        uni (:class:`~exatomic.core.universe.Universe`): Universe of 3x3x3x super cell
    """
    adxs = []
    xs = []
    ys = []
    zs = []
    prjs = []
    fdxs = []
    grps = u.atom.groupby("frame")
    for fdx, atom in grps:
        adx, x, y, z, prj = _worker(atom.index.values.astype(np.int64),
                                    atom['x'].values.astype(np.float64),
                                    atom['y'].values.astype(np.float64),
                                    atom['z'].values.astype(np.float64), a)
        adxs.append(adx)
        xs.append(x)
        ys.append(y)
        zs.append(z)
        prjs.append(prj)
        fdxs += [fdx]*len(adx)
    adxs = np.concatenate(adxs)
    xs = np.concatenate(xs)
    ys = np.concatenate(ys)
    zs = np.concatenate(zs)
    prjs = np.concatenate(prjs)
    # Overwrite the last 'atom' group because that value doesn't need to exist anymore
    atom = pd.DataFrame.from_dict({'atom': adxs, 'x': xs, 'y': ys, 'z': zs, 'prj': prjs})
    atom['frame'] = fdxs
    atom['symbol'] = atom['atom'].map(u.atom['symbol'])
    atom['label'] = atom['atom'].map(u.atom['label'])
    atom = Atom(atom)
    return Universe(atom=atom)


[docs]def periodic_nearest_neighbors_by_atom(uni, source, a, sizes, **kwargs):
    """
    Determine nearest neighbor molecules to a given source (or sources) and
    return the data as a dataframe.

    Warning:
        For universes with more than about 250 atoms, consider using the
        slower but more memory efficient
        :func:`~exatomic.algorithms.neighbors.periodic_nearest_neighbors_by_atom_large`.

    For a simple cubic periodic system with unit cell dimension ``a``,
    clusters can be generated as follows. In the example below, additional
    keyword arguments have been included as they are almost always required
    in order to correctly identify molecular units semi-empirically.

    .. code-block:: python

        periodic_nearest_neighbors_by_atom(u, [0], 40.0, [0, 5, 10, 50],
                                           dmax=40.0, C=1.6, O=1.6)

    Argument descriptions can be found below. The additional keyword arguments,
    ``dmax``, ``C``, ``O``, are passed directly to the two body computation used
    to determine (semi-empirically) molecular units. Note that although molecules
    are computed, neighboring molecular units are determine by an atom to atom
    criteria.

    Args:
        uni (:class:`~exatomic.core.universe.Universe`): Universe
        source (int, str, list): Integer label or string symbol of source atom
        a (float): Cubic unit cell dimension
        sizes (list): List of slices to create
        kwargs: Additional keyword arguments to be passed to atom two body calculation

    Returns:
        dct (dict): Dictionary of sliced universes and nearest neighbor table

    See Also:
        Sliced universe construction can be facilitated by
        :func:`~exatomic.algorithms.neighbors.construct`.
    """
    def sorter(group, source_atom_idxs):
        s = group[['atom0', 'atom1']].stack()
        return s[~s.isin(source_atom_idxs)].reset_index()

    if "label" not in uni.atom.columns:
        uni.atom['label'] = uni.atom.get_atom_labels()
    dct = defaultdict(list)
    grps = uni.atom.groupby("frame")
    ntot = len(grps)
    fp = FloatProgress(description="Slicing:")
    display(fp)
    for i, (fdx, atom) in enumerate(grps):
        if len(atom) > 0:
            uu = _create_super_universe(Universe(atom=atom.copy()), a)
            uu.compute_atom_two(**kwargs)
            uu.compute_molecule()
            if isinstance(source, (int, np.int32, np.int64)):
                source_atom_idxs = uu.atom[(uu.atom.index.isin([source])) &
                                           (uu.atom['prj'] == 13)].index.values
            elif isinstance(source, (list, tuple)):
                source_atom_idxs = uu.atom[uu.atom['label'].isin(source) &
                                           (uu.atom['prj'] == 13)].index.values
            else:
                source_atom_idxs = uu.atom[(uu.atom['symbol'] == source) &
                                           (uu.atom['prj'] == 13)].index.values
            source_molecule_idxs = uu.atom.loc[source_atom_idxs, 'molecule'].unique().astype(int)
            uu.atom_two['frame'] = uu.atom_two['atom0'].map(uu.atom['frame'])
            nearest_atoms = uu.atom_two[(uu.atom_two['atom0'].isin(source_atom_idxs)) |
                                        (uu.atom_two['atom1'].isin(source_atom_idxs))].sort_values("dr")[['frame', 'atom0', 'atom1']]
            nearest = nearest_atoms.groupby("frame").apply(sorter, source_atom_idxs=source_atom_idxs)
            del nearest['level_1']
            nearest.index.names = ['frame', 'idx']
            nearest.columns = ['two', 'atom']
            nearest['molecule'] = nearest['atom'].map(uu.atom['molecule'])
            nearest = nearest[~nearest['molecule'].isin(source_molecule_idxs)]
            nearest = nearest.drop_duplicates('molecule', keep='first')
            nearest.reset_index(inplace=True)
            nearest['frame'] = nearest['frame'].astype(int)
            nearest['molecule'] = nearest['molecule'].astype(int)
            dct['nearest'].append(nearest)
            for nn in sizes:
                atm = []
                for j, fdx in enumerate(nearest['frame'].unique()):
                    mdxs = nearest.loc[nearest['frame'] == fdx, 'molecule'].tolist()[:nn]
                    mdxs.append(source_molecule_idxs[j])
                    atm.append(uu.atom[uu.atom['molecule'].isin(mdxs)][['symbol', 'x', 'y', 'z', 'frame']].copy())
                dct[nn].append(pd.concat(atm, ignore_index=True))
        fp.value = i/ntot*100
    dct['nearest'] = pd.concat(dct['nearest'], ignore_index=True)
    for nn in sizes:
        dct[nn] = Universe(atom=pd.concat(dct[nn], ignore_index=True))
    fp.close()
    return dct


[docs]def periodic_nearest_neighbors_by_atom_large(uni, source, a, sizes, **kwargs):
    """
    Determine nearest neighbor molecules to a given source (or sources) and
    return the data as a dataframe.

    Tip:
        This function performs the same operation as
        :func:`~exatomic.algorithms.neighbors.periodic_nearest_neighbors_by_atom`,
        but is meant for universes containing more than about 250 atoms
        per frame (the referenced function will be faster for smaller universes).

    For a simple cubic periodic system with unit cell dimension ``a``,
    clusters can be generated as follows. In the example below, additional
    keyword arguments have been included as they are almost always required
    in order to correctly identify molecular units semi-empirically.

    .. code-block:: python

        periodic_nearest_neighbors_by_atom_ooc(u, [0], 40.0, [0, 5, 10, 50],
                                           dmax=40.0, C=1.6, O=1.6)

    Argument descriptions can be found below. The additional keyword arguments,
    ``dmax``, ``C``, ``O``, are passed directly to the two body computation used
    to determine (semi-empirically) molecular units. Note that although molecules
    are computed, neighboring molecular units are determine by an atom to atom
    criteria.

    Args:
        uni (:class:`~exatomic.core.universe.Universe`): Universe
        source (int, str, list): Integer label or string symbol of source atom
        a (float): Cubic unit cell dimension
        sizes (iterable): List of slices to create
        kwargs: Additional keyword arguments to be passed to atom two body calculation

    Returns:
        dct (dict): Dictionary of sliced universes and nearest neighbor table

    See Also:
        Sliced universe construction can be facilitated by
        :func:`~exatomic.algorithms.neighbors.construct`.
    """
    if "label" not in uni.atom.columns:
        uni.atom['label'] = uni.atom.get_atom_labels()
    if not isinstance(sizes, list):
        raise TypeError("Argument sizes must be iterable of ints.")
    dct = defaultdict(list)
    grps = uni.atom.groupby("frame")
    ntot = len(grps)
    fp = FloatProgress(description="Slicing:")
    display(fp)
    for i, (fdx, atom) in enumerate(grps):
        if len(atom) > 0:
            uu = Universe(atom=atom.copy())
            uu.frame = uni.frame.loc[[fdx], :]
            uu.compute_atom_two(**kwargs)
            uu.compute_molecule()
            if isinstance(source, (int, np.int32, np.int64)):
                source_atom_idxs = [source]
            elif isinstance(source, np.ndarray):
                source_atom_idxs = source.tolist()
            elif isinstance(source, (list, tuple)):
                source_atom_idxs = uu.atom[uu.atom['label'].isin(source) |
                                           uu.atom['symbol'].isin(source)].index.astype(int).tolist()
            else:
                source_atom_idxs = uu.atom[uu.atom['symbol'] == source].index.astype(int).tolist()
            source_molecule_idxs = uu.atom.loc[source_atom_idxs, 'molecule'].unique().astype(int).tolist()
            # Identify the nearest molecules
            nearest_atoms = uu.atom_two[uu.atom_two['atom0'].isin(source_atom_idxs) |
                                        uu.atom_two['atom1'].isin(source_atom_idxs)].sort_values('dr')[['atom0', 'atom1']].copy()
            nearest_atoms['molecule0'] = nearest_atoms['atom0'].map(uu.atom['molecule'])
            nearest_atoms['molecule1'] = nearest_atoms['atom1'].map(uu.atom['molecule'])
            nearest_molecules = nearest_atoms[['molecule0', 'molecule1']].stack()
            nearest_molecules = nearest_molecules[~nearest_molecules.isin(source_molecule_idxs)].drop_duplicates(keep='first')
            # Build the appropriate universes
            for nn in sizes:
                atom1 = uu.atom.loc[uu.atom['molecule'].isin(nearest_molecules.iloc[:nn].tolist()+source_molecule_idxs),
                                   ['symbol', 'x', 'y', 'z']]
                adxs, x, y, z, prj = _worker(atom1.index.values.astype(int),
                                             atom1['x'].values.astype(float),
                                             atom1['y'].values.astype(float),
                                             atom1['z'].values.astype(float), a)
                patom = pd.DataFrame.from_dict({'atom': adxs, 'x': x, 'y': y, 'z': z, 'prj': prj})
                patom['frame'] = patom['atom'].map(uu.atom['frame'])
                patom['symbol'] = patom['atom'].map(uu.atom['symbol'])
                sliced_u = Universe(atom=patom)
                sliced_u.compute_atom_two(dmax=a)
                sliced_u.compute_molecule()
                source_adxs1 = sliced_u.atom[(sliced_u.atom['prj'] == 13) & sliced_u.atom['atom'].isin(source_atom_idxs)].index
                source_mdxs1 = sliced_u.atom.loc[source_adxs1, 'molecule'].unique().tolist()
                nearest_atoms1 = sliced_u.atom_two[sliced_u.atom_two['atom0'].isin(source_adxs1) |
                                                   sliced_u.atom_two['atom1'].isin(source_adxs1)].sort_values('dr')[['atom0', 'atom1']].copy()
                nearest_atoms1['molecule0'] = nearest_atoms1['atom0'].map(sliced_u.atom['molecule'])
                nearest_atoms1['molecule1'] = nearest_atoms1['atom1'].map(sliced_u.atom['molecule'])
                nearest_molecules1 = nearest_atoms1[['molecule0', 'molecule1']].stack()
                nearest_molecules1 = nearest_molecules1[~nearest_molecules1.isin(source_mdxs1)].drop_duplicates(keep='first')
                # Its fine to overwrite atom1 above since the uu.atom slice is not necessarily clustered
                atom1 = sliced_u.atom.loc[sliced_u.atom['molecule'].isin(nearest_molecules1.iloc[:nn].tolist()+source_mdxs1)].copy()
                dct[nn].append(atom1)
            index = nearest_molecules.index.get_level_values(0)
            nearest_molecules = nearest_molecules.to_frame()
            nearest_molecules.columns = ['molecule']
            nearest_molecules['frame'] = fdx
            nearest_molecules['atom0'] = nearest_atoms.loc[index, 'atom0'].values
            nearest_molecules['atom1'] = nearest_atoms.loc[index, 'atom1'].values
            dct['nearest'].append(nearest_molecules)
        fp.value = i/ntot*100
    dct['nearest'] = pd.concat(dct['nearest'], ignore_index=True)
    for nn in sizes:
        dct[nn] = Universe(atom=pd.concat(dct[nn], ignore_index=True))
    fp.close()
    return dct