Source code for exatomic.algorithms.neighbors

# -*- coding: utf-8 -*-
# Copyright (c) 2015-2022, Exa Analytics Development Team
# Distributed under the terms of the Apache License 2.0
"""
Neighbor Selection Algorithms
###############################
This module provides algoirthms for selecting nearest neighbors, e.g. nearest
solvent molecules to a solute molecule. Because two body properties do not always
represent the desired molecules (i.e. bonds appear where they are not desired),
these algorithms are not completely black box.

Before performing a search, check that the molecule table is computed as desired
and classified (if necessary): see :func:`~exatomic.two.BaseTwo.compute_bonds`
and :func:`~exatomic.molecule.Molecule.classify`.
"""
import numpy as np
import pandas as pd
import numba as nb
from collections import defaultdict
from IPython.display import display
from ipywidgets import FloatProgress
from exatomic.base import nbpll
from exatomic.core.atom import Atom
from exatomic.core.universe import Universe


@nb.jit(nopython=True, parallel=nbpll)
def _worker(idx, x, y, z, a):
    """
    Generate a 3x3x3 'super' cell from a cubic unit cell.

    Args:
        idx (array): Array of index values
        x (array): Array of x coordinates
        y (array): Array of y coordinates
        z (array): Array of z coordinates
        a (float): Cubic unit cell dimension
    """
    n = len(x)
    idxs = np.empty((27*n, ), dtype=np.int64)
    prj = idxs.copy()
    px = np.empty((27*n, ), dtype=np.float64)
    py = px.copy()
    pz = px.copy()
    p = 0
    m = 0
    for i in [-1, 0, 1]:
        for j in [-1, 0, 1]:
            for k in [-1, 0, 1]:
                for l in range(n):
                    idxs[m] = idx[l]
                    px[m] = x[l] + i*a
                    py[m] = y[l] + j*a
                    pz[m] = z[l] + k*a
                    prj[m] = p
                    m += 1
                p += 1
    return idxs, px, py, pz, prj


def _create_super_universe(u, a):
    """
    Generate a 3x3x3 super cell from a cubic periodic universe

    Args:
        u (:class:`~exatomic.core.universe.Universe`): Universe
        a (float): Cubic unit cell dimension

    Returns:
        uni (:class:`~exatomic.core.universe.Universe`): Universe of 3x3x3x super cell
    """
    adxs = []
    xs = []
    ys = []
    zs = []
    prjs = []
    fdxs = []
    grps = u.atom.groupby("frame")
    for fdx, atom in grps:
        adx, x, y, z, prj = _worker(atom.index.values.astype(np.int64),
                                    atom['x'].values.astype(np.float64),
                                    atom['y'].values.astype(np.float64),
                                    atom['z'].values.astype(np.float64), a)
        adxs.append(adx)
        xs.append(x)
        ys.append(y)
        zs.append(z)
        prjs.append(prj)
        fdxs += [fdx]*len(adx)
    adxs = np.concatenate(adxs)
    xs = np.concatenate(xs)
    ys = np.concatenate(ys)
    zs = np.concatenate(zs)
    prjs = np.concatenate(prjs)
    # Overwrite the last 'atom' group because that value doesn't need to exist anymore
    atom = pd.DataFrame.from_dict({'atom': adxs, 'x': xs, 'y': ys, 'z': zs, 'prj': prjs})
    atom['frame'] = fdxs
    atom['symbol'] = atom['atom'].map(u.atom['symbol'])
    atom['label'] = atom['atom'].map(u.atom['label'])
    atom = Atom(atom)
    return Universe(atom=atom)


[docs]def periodic_nearest_neighbors_by_atom(uni, source, a, sizes, **kwargs): """ Determine nearest neighbor molecules to a given source (or sources) and return the data as a dataframe. Warning: For universes with more than about 250 atoms, consider using the slower but more memory efficient :func:`~exatomic.algorithms.neighbors.periodic_nearest_neighbors_by_atom_large`. For a simple cubic periodic system with unit cell dimension ``a``, clusters can be generated as follows. In the example below, additional keyword arguments have been included as they are almost always required in order to correctly identify molecular units semi-empirically. .. code-block:: python periodic_nearest_neighbors_by_atom(u, [0], 40.0, [0, 5, 10, 50], dmax=40.0, C=1.6, O=1.6) Argument descriptions can be found below. The additional keyword arguments, ``dmax``, ``C``, ``O``, are passed directly to the two body computation used to determine (semi-empirically) molecular units. Note that although molecules are computed, neighboring molecular units are determine by an atom to atom criteria. Args: uni (:class:`~exatomic.core.universe.Universe`): Universe source (int, str, list): Integer label or string symbol of source atom a (float): Cubic unit cell dimension sizes (list): List of slices to create kwargs: Additional keyword arguments to be passed to atom two body calculation Returns: dct (dict): Dictionary of sliced universes and nearest neighbor table See Also: Sliced universe construction can be facilitated by :func:`~exatomic.algorithms.neighbors.construct`. """ def sorter(group, source_atom_idxs): s = group[['atom0', 'atom1']].stack() return s[~s.isin(source_atom_idxs)].reset_index() if "label" not in uni.atom.columns: uni.atom['label'] = uni.atom.get_atom_labels() dct = defaultdict(list) grps = uni.atom.groupby("frame") ntot = len(grps) fp = FloatProgress(description="Slicing:") display(fp) for i, (fdx, atom) in enumerate(grps): if len(atom) > 0: uu = _create_super_universe(Universe(atom=atom.copy()), a) uu.compute_atom_two(**kwargs) uu.compute_molecule() if isinstance(source, (int, np.int32, np.int64)): source_atom_idxs = uu.atom[(uu.atom.index.isin([source])) & (uu.atom['prj'] == 13)].index.values elif isinstance(source, (list, tuple)): source_atom_idxs = uu.atom[uu.atom['label'].isin(source) & (uu.atom['prj'] == 13)].index.values else: source_atom_idxs = uu.atom[(uu.atom['symbol'] == source) & (uu.atom['prj'] == 13)].index.values source_molecule_idxs = uu.atom.loc[source_atom_idxs, 'molecule'].unique().astype(int) uu.atom_two['frame'] = uu.atom_two['atom0'].map(uu.atom['frame']) nearest_atoms = uu.atom_two[(uu.atom_two['atom0'].isin(source_atom_idxs)) | (uu.atom_two['atom1'].isin(source_atom_idxs))].sort_values("dr")[['frame', 'atom0', 'atom1']] nearest = nearest_atoms.groupby("frame").apply(sorter, source_atom_idxs=source_atom_idxs) del nearest['level_1'] nearest.index.names = ['frame', 'idx'] nearest.columns = ['two', 'atom'] nearest['molecule'] = nearest['atom'].map(uu.atom['molecule']) nearest = nearest[~nearest['molecule'].isin(source_molecule_idxs)] nearest = nearest.drop_duplicates('molecule', keep='first') nearest.reset_index(inplace=True) nearest['frame'] = nearest['frame'].astype(int) nearest['molecule'] = nearest['molecule'].astype(int) dct['nearest'].append(nearest) for nn in sizes: atm = [] for j, fdx in enumerate(nearest['frame'].unique()): mdxs = nearest.loc[nearest['frame'] == fdx, 'molecule'].tolist()[:nn] mdxs.append(source_molecule_idxs[j]) atm.append(uu.atom[uu.atom['molecule'].isin(mdxs)][['symbol', 'x', 'y', 'z', 'frame']].copy()) dct[nn].append(pd.concat(atm, ignore_index=True)) fp.value = i/ntot*100 dct['nearest'] = pd.concat(dct['nearest'], ignore_index=True) for nn in sizes: dct[nn] = Universe(atom=pd.concat(dct[nn], ignore_index=True)) fp.close() return dct
[docs]def periodic_nearest_neighbors_by_atom_large(uni, source, a, sizes, **kwargs): """ Determine nearest neighbor molecules to a given source (or sources) and return the data as a dataframe. Tip: This function performs the same operation as :func:`~exatomic.algorithms.neighbors.periodic_nearest_neighbors_by_atom`, but is meant for universes containing more than about 250 atoms per frame (the referenced function will be faster for smaller universes). For a simple cubic periodic system with unit cell dimension ``a``, clusters can be generated as follows. In the example below, additional keyword arguments have been included as they are almost always required in order to correctly identify molecular units semi-empirically. .. code-block:: python periodic_nearest_neighbors_by_atom_ooc(u, [0], 40.0, [0, 5, 10, 50], dmax=40.0, C=1.6, O=1.6) Argument descriptions can be found below. The additional keyword arguments, ``dmax``, ``C``, ``O``, are passed directly to the two body computation used to determine (semi-empirically) molecular units. Note that although molecules are computed, neighboring molecular units are determine by an atom to atom criteria. Args: uni (:class:`~exatomic.core.universe.Universe`): Universe source (int, str, list): Integer label or string symbol of source atom a (float): Cubic unit cell dimension sizes (iterable): List of slices to create kwargs: Additional keyword arguments to be passed to atom two body calculation Returns: dct (dict): Dictionary of sliced universes and nearest neighbor table See Also: Sliced universe construction can be facilitated by :func:`~exatomic.algorithms.neighbors.construct`. """ if "label" not in uni.atom.columns: uni.atom['label'] = uni.atom.get_atom_labels() if not isinstance(sizes, list): raise TypeError("Argument sizes must be iterable of ints.") dct = defaultdict(list) grps = uni.atom.groupby("frame") ntot = len(grps) fp = FloatProgress(description="Slicing:") display(fp) for i, (fdx, atom) in enumerate(grps): if len(atom) > 0: uu = Universe(atom=atom.copy()) uu.frame = uni.frame.loc[[fdx], :] uu.compute_atom_two(**kwargs) uu.compute_molecule() if isinstance(source, (int, np.int32, np.int64)): source_atom_idxs = [source] elif isinstance(source, np.ndarray): source_atom_idxs = source.tolist() elif isinstance(source, (list, tuple)): source_atom_idxs = uu.atom[uu.atom['label'].isin(source) | uu.atom['symbol'].isin(source)].index.astype(int).tolist() else: source_atom_idxs = uu.atom[uu.atom['symbol'] == source].index.astype(int).tolist() source_molecule_idxs = uu.atom.loc[source_atom_idxs, 'molecule'].unique().astype(int).tolist() # Identify the nearest molecules nearest_atoms = uu.atom_two[uu.atom_two['atom0'].isin(source_atom_idxs) | uu.atom_two['atom1'].isin(source_atom_idxs)].sort_values('dr')[['atom0', 'atom1']].copy() nearest_atoms['molecule0'] = nearest_atoms['atom0'].map(uu.atom['molecule']) nearest_atoms['molecule1'] = nearest_atoms['atom1'].map(uu.atom['molecule']) nearest_molecules = nearest_atoms[['molecule0', 'molecule1']].stack() nearest_molecules = nearest_molecules[~nearest_molecules.isin(source_molecule_idxs)].drop_duplicates(keep='first') # Build the appropriate universes for nn in sizes: atom1 = uu.atom.loc[uu.atom['molecule'].isin(nearest_molecules.iloc[:nn].tolist()+source_molecule_idxs), ['symbol', 'x', 'y', 'z']] adxs, x, y, z, prj = _worker(atom1.index.values.astype(int), atom1['x'].values.astype(float), atom1['y'].values.astype(float), atom1['z'].values.astype(float), a) patom = pd.DataFrame.from_dict({'atom': adxs, 'x': x, 'y': y, 'z': z, 'prj': prj}) patom['frame'] = patom['atom'].map(uu.atom['frame']) patom['symbol'] = patom['atom'].map(uu.atom['symbol']) sliced_u = Universe(atom=patom) sliced_u.compute_atom_two(dmax=a) sliced_u.compute_molecule() source_adxs1 = sliced_u.atom[(sliced_u.atom['prj'] == 13) & sliced_u.atom['atom'].isin(source_atom_idxs)].index source_mdxs1 = sliced_u.atom.loc[source_adxs1, 'molecule'].unique().tolist() nearest_atoms1 = sliced_u.atom_two[sliced_u.atom_two['atom0'].isin(source_adxs1) | sliced_u.atom_two['atom1'].isin(source_adxs1)].sort_values('dr')[['atom0', 'atom1']].copy() nearest_atoms1['molecule0'] = nearest_atoms1['atom0'].map(sliced_u.atom['molecule']) nearest_atoms1['molecule1'] = nearest_atoms1['atom1'].map(sliced_u.atom['molecule']) nearest_molecules1 = nearest_atoms1[['molecule0', 'molecule1']].stack() nearest_molecules1 = nearest_molecules1[~nearest_molecules1.isin(source_mdxs1)].drop_duplicates(keep='first') # Its fine to overwrite atom1 above since the uu.atom slice is not necessarily clustered atom1 = sliced_u.atom.loc[sliced_u.atom['molecule'].isin(nearest_molecules1.iloc[:nn].tolist()+source_mdxs1)].copy() dct[nn].append(atom1) index = nearest_molecules.index.get_level_values(0) nearest_molecules = nearest_molecules.to_frame() nearest_molecules.columns = ['molecule'] nearest_molecules['frame'] = fdx nearest_molecules['atom0'] = nearest_atoms.loc[index, 'atom0'].values nearest_molecules['atom1'] = nearest_atoms.loc[index, 'atom1'].values dct['nearest'].append(nearest_molecules) fp.value = i/ntot*100 dct['nearest'] = pd.concat(dct['nearest'], ignore_index=True) for nn in sizes: dct[nn] = Universe(atom=pd.concat(dct[nn], ignore_index=True)) fp.close() return dct