Source code for exatomic.gaussian.output

# -*- coding: utf-8 -*-
# Copyright (c) 2015-2022, Exa Analytics Development Team
# Distributed under the terms of the Apache License 2.0
"""
Gaussian Output Editor
#########################
Editor classes for various types of Gaussian output files
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import re
import six
import numpy as np
import pandas as pd
from collections import defaultdict
from exatomic.exa import TypedMeta
from exatomic.exa.util.units import Length, Energy
from .editor import Editor
from exatomic.base import z2sym
from exatomic.core.frame import compute_frame_from_atom
from exatomic.core.frame import Frame
from exatomic.core.atom import Atom, Frequency
from exatomic.core.gradient import Gradient
from exatomic.core.tensor import NMRShielding
from exatomic.core.basis import (BasisSet, BasisSetOrder, Overlap,
                                 deduplicate_basis_sets)
from exatomic.core.orbital import Orbital, MOMatrix, Excitation
from exatomic.algorithms.basis import lmap, lorder
from numba import jit


@jit(nopython=True, nogil=True, cache=True)
def _triangular_indices(ncol, nbas):
    dim = nbas * (nbas + 1) // 2
    idx = np.empty((dim, 3), dtype=np.int64)
    cnt = 0
    for i in range(ncol):
        for j in range(i, nbas, ncol):
            for k in range(j, nbas):
                idx[cnt,0] = j
                idx[cnt,1] = k
                idx[cnt,2] = 0
                cnt += 1
    return idx

[docs]class GauMeta(TypedMeta): atom = Atom basis_set = BasisSet orbital = Orbital momatrix = MOMatrix basis_set_order = BasisSetOrder frame = Frame excitation = Excitation frequency = Frequency overlap = Overlap multipole = pd.DataFrame gradient = Gradient nmr_shielding = NMRShielding frequency_ext = pd.DataFrame hessian = pd.DataFrame elec_dipole = pd.DataFrame elec_quadrupole = pd.DataFrame mag_dipole = pd.DataFrame spec_prop = pd.DataFrame
[docs]class Output(six.with_metaclass(GauMeta, Editor)): def _parse_triangular_matrix(self, regex, column='coef', values_only=False): _rebas01 = r'basis functions,' found = self.find_next(_rebas01, keys_only=True) nbas = int(self[found].split()[0]) found = self.find_next(regex, keys_only=True) if not found: return ncol = len(self[found + 1].split()) start = found + 2 rmdr = nbas % ncol skips = np.array(list(reversed(range(rmdr, nbas + max(1, rmdr), ncol)))) skips = np.cumsum(skips) + np.arange(len(skips)) stop = start + skips[-1] matrix = self.pandas_dataframe(start, stop, ncol + 1, index_col=0, skiprows=skips, ).unstack().dropna().apply( lambda x: x.replace('D', 'E') ).astype(np.float64).values if values_only: return matrix idxs = _triangular_indices(ncol, nbas) return pd.DataFrame.from_dict({'chi0': idxs[:,0], 'chi1': idxs[:,1], 'frame': idxs[:,2], column: matrix}) def _get_dipole(self, length=True): _relen = "Ground to excited state transition electric dipole moments" _revel = "Ground to excited state transition velocity dipole moments" found = self.find(_relen, _revel, keys_only=True) if not found[_relen]: return key = _relen if length else _revel start = found[key][0] + 2 stop = start while self[stop].split()[0] != 'Ground': stop += 1 df = self.pandas_dataframe(start, stop, ncol=6) df.drop(0, axis=1, inplace=True) df.columns = ['x', 'y', 'z', 'dipstr', 'oscilstr'] return df
[docs] def parse_atom(self, orientation=None): ''' Parse the atom tables. Args: orientation (str, optional): Specify which orientation to parse. Will only accept `'input'` or `'standard'`. Defaults to `None` (will determine orientation to parse internally). ''' # Atom flags _regeom01 = 'Input orientation' _regeom02 = 'Standard orientation' # Find our data found = self.find(_regeom01, _regeom02, keys_only=True) if orientation is None: key = _regeom02 if found[_regeom02] else _regeom01 else: if orientation.lower() == 'input': key = _regeom01 elif orientation.lower() == 'standard': if not found[_regeom02]: msg = "Parsing of the standard orientation was " \ +"requested, but there were no atom tables " \ +"found. Will parse the input orientations." warnings.warn(msg, Warning) key = _regeom01 else: key = _regeom02 else: msg = "Could not understand the requested " \ +"orientation to parse. Expected " \ +"'Standard' or 'Input', but got {}." raise ValueError(msg.format(orientation)) starts = np.array(found[key]) + 5 # Prints converged geometry twice but only need it once starts = starts[:-1] if len(starts) > 1 else starts stop = starts[0] # Find where the data stops while '-------' not in self[stop]: stop += 1 # But it should be same sized array each time stops = starts + (stop - starts[0]) dfs = [] # Iterate over frames for i, (start, stop) in enumerate(zip(starts, stops)): atom = self.pandas_dataframe(start, stop, 6) atom['frame'] = i dfs.append(atom) atom = pd.concat(dfs).reset_index(drop=True) # Drop the column of atomic type (whatever that is) atom.drop([2], axis=1, inplace=True) # Name the data atom.columns = ['set', 'Z', 'x', 'y', 'z', 'frame'] # Zero-based indexing atom['set'] -= 1 # Convert to atomic units atom['x'] *= Length['Angstrom', 'au'] atom['y'] *= Length['Angstrom', 'au'] atom['z'] *= Length['Angstrom', 'au'] # Map atomic symbols onto Z numbers atom['symbol'] = atom['Z'].map(z2sym) self.atom = atom
[docs] def parse_basis_set(self): # Basis flags _rebas02 = 'AO basis set in the form of general basis input' _rebas03 = ' (Standard|General) basis' _basrep = {'D 0': 'D0 ', 'F 0': 'F0 ', 'G 0': 'G0 ', 'H 0': 'H0 ', 'I 0': 'I0 '} _rebaspat = re.compile('|'.join(_basrep.keys())) # Find the basis set found = self.regex(_rebas02, _rebas03, keys_only=True) if not found[_rebas02]: return start = stop = found[_rebas02][0] + 1 while self[stop].strip(): stop += 1 # Raw data df = self.pandas_dataframe(start, stop, 4) def _padx(srs): return [0] + srs.tolist() + [df.shape[0]] # Get some indices for appropriate columns setdx = _padx(df[0][df[0] == '****'].index) shldx = _padx(df[3][~np.isnan(df[3])].index) lindx = df[0][df[0].str.lower().isin(lorder + ['sp'])] # Populate the df df['L'] = lindx.str.lower().map(lmap) df['L'] = df['L'].fillna(method='ffill').fillna( method='bfill').astype(np.int64) df['center'] = np.concatenate([np.repeat(i, stop - start) for i, (start, stop) in enumerate(zip(setdx, setdx[1:]))]) df['shell'] = np.concatenate([np.repeat(i-1, stop - start) for i, (start, stop) in enumerate(zip(shldx, shldx[1:]))]) # Complicated way to get shells but it is flat maxshl = df.groupby('center').apply(lambda x: x.shell.max() + 1) maxshl.index += 1 maxshl[0] = 0 df['shell'] = df['shell'] - df['center'].map(maxshl) # Drop all the garbage todrop = setdx[:-1] + [i+1 for i in setdx[:-2]] + lindx.index.tolist() df.drop(todrop, inplace=True) # Keep cleaning if df[0].dtype == 'object': df[0] = df[0].str.replace('D', 'E').astype(np.float64) if df[1].dtype == 'object': df[1] = df[1].str.replace('D', 'E').astype(np.float64) try: sp = np.isnan(df[2]).sum() == df.shape[0] except TypeError: df[2] = df[2].str.replace('D', 'E').astype(np.float64) sp = True df.rename(columns={0: 'alpha', 1: 'd'}, inplace=True) # Deduplicate basis sets and expand 'SP' shells if present df, setmap = deduplicate_basis_sets(df, sp=sp) spherical = '5D' in self[found[_rebas03][0]] if df['L'].max() < 2: spherical = True self.basis_set = BasisSet(df) self.meta['spherical'] = spherical self.atom['set'] = self.atom['set'].map(setmap)
[docs] def parse_orbital(self): _repop = 'Population analysis using the SCF density' start_orb = self.find(_repop, keys_only=True) start_orb = start_orb[-1] _rebas01 = r'basis functions,' # Orbital flags _realphaelec = 'alpha electrons' _reorb01 = '(?=Alpha|Beta).*(?=occ|virt)' _reorb02 = 'Orbital symmetries' _orbslice = [slice(10 * i, 10 * i + 9) for i in range(5)] _symrep = {'Occupied': '', 'Virtual': '', 'Alpha Orbitals:': '', 'Beta Orbitals:': '', '\(': '', '\)': ''} _resympat = re.compile('|'.join(_symrep.keys())) _symrep['('] = '' _symrep[')'] = '' # Find where our data is found = self.regex(_reorb01, _reorb02, _rebas01, _realphaelec, start=start_orb) # If no orbital energies, quit if not found[_reorb01]: return # Check if open shell os = any(('Beta' in ln for lno, ln in found[_reorb01])) #UNUSED? #occ = 1 if os else 2 # Find number of electrons ae, x, x, be, x, x = self.regex(_realphaelec)[0][1].split() ae, be = int(ae), int(be) # Get orbital energies ens = '\n'.join([ln.split('-- ')[1] for i, ln in found[_reorb01]]) ens = pd.read_fwf(six.StringIO(ens), header=None, widths=np.repeat(10, 5)).stack().values # Other arrays orbital = Orbital.from_energies(ens, ae, be, os=os) # Symmetry labels if found[_reorb02]: # Gaussian seems to print out a lot of these blocks # maybe a better way to deal with this allsyms = [] match = ['(', 'Orbitals'] for i, (start, _) in enumerate(found[_reorb02]): start += start_orb # Find the start, stop indices for each block while match[0] not in self[start]: start += 1 stop = start + 1 while any((i in self[stop] for i in match)): stop += 1 # Clean up the text block so it is just symmetries syms = _resympat.sub(lambda m: _symrep[m.group(0)], ' '.join([i.strip() for i in self[start:stop]])).split() # cat the syms for each block together allsyms += syms # Add it to our dataframe orbital['symmetry'] = allsyms[-orbital.shape[0]:] self.orbital = orbital
[docs] def parse_momatrix(self): """ Parses the MO matrix if asked for in the input. Note: Requires specification of pop(full) or pop(no) or the like. """ if hasattr(self, '_momatrix'): return _rebas01 = r'basis functions,' # MOMatrix flags _remomat01 = r'pop.*(?=full|no)' _remomat02 = 'Orbital Coefficients' _basrep = {'D 0': 'D0 ', 'F 0': 'F0 ', 'G 0': 'G0 ', 'H 0': 'H0 ', 'I 0': 'I0 '} _rebaspat = re.compile('|'.join(_basrep.keys())) # Check if a full MO matrix was specified in the input check = self.regex(_remomat01, stop=1000, flags=re.IGNORECASE) if not check: return # Find approximately where our data is found = self.find(_remomat02, _rebas01) # Get some dimensions ndim = len(found[_remomat02]) # If something goes wrong if not ndim: return nbas = int(found[_rebas01][0][1].split()[0]) nblocks = np.int64(np.ceil(nbas / 5)) # Allocate a big ol' array coefs = np.empty((nbas ** 2, ndim), dtype=np.float64) # Dynamic column generation hasn't been worked out yet colnames = ['coef'] + ['coef' + str(i) for i in range(1, ndim)] # Iterate over where the data was found # c counts the column in the resulting momatrix table _csv_args = {'delim_whitespace': True, 'header': None} for c, (lno, _) in enumerate(found[_remomat02]): gap = 0 while not 'eigenvalues' in self[lno + gap].lower(): gap += 1 start = lno + gap + 1 stop = start + nbas # The basis set order is printed with every chunk of eigenvectors if not c: mapr = self.basis_set.groupby(['set', 'L']).apply( lambda x: x['shell'].unique()).to_dict() self.basis_set_order = _basis_set_order(self[start:stop], mapr, self.atom['set']) # Some fudge factors due to extra lines being printed space = start - lno - 1 fnbas = nbas + space span = start + fnbas * nblocks # Finally get where our chunks are starts = np.arange(start, span, fnbas) stops = np.arange(stop, span, fnbas) stride = 0 # b counts the blocks of eigenvectors per column in momatrix for b, (start, stop) in enumerate(zip(starts, stops)): # Number of eigenvectors in this block ncol = len(self[start][21:].split()) step = nbas * ncol _csv_args['names'] = range(ncol) # Massage the text so that we can read csv block = '\n'.join([ln[21:] for ln in self[start:stop]]) block = _rebaspat.sub(lambda m: _basrep[m.group(0)], block) # Enplacen the resultant unstacked values coefs[stride:stride + nbas * ncol, c] = pd.read_fwf( six.StringIO(block), header=None, widths=np.repeat(10, 5)).unstack().dropna().values stride += step # Index chi, phi chis = np.tile(range(nbas), nbas) orbs = np.repeat(range(nbas), nbas) momatrix = pd.DataFrame(coefs, columns=colnames) momatrix['chi'] = chis momatrix['orbital'] = orbs # Frame not really implemented for momatrix momatrix['frame'] = 0 self.momatrix = momatrix
[docs] def parse_basis_set_order(self): if hasattr(self, '_basis_set_order'): return self.parse_momatrix()
[docs] def parse_frame(self): # Frame flags _retoten = 'SCF Done:' _realphaelec = 'alpha electrons' _reelecstate = 'The electronic state' # Get the default frame from the atom table self.frame = compute_frame_from_atom(self.atom) # Find our data found = self.find(_retoten, _realphaelec, _reelecstate) # Extract just the total SCF energies ens = [float(ln.split()[4]) for lno, ln in found[_retoten]] # If 'SCF Done' prints out more times than frames try: ens = ens if len(self.frame) == len(ens) else ens[-len(self.frame):] self.frame['E_tot'] = ens except ValueError: pass # We will assume number of electrons doesn't change per frame ae, x, x, be, x, x = found[_realphaelec][0][1].split() self.frame['N_e'] = int(ae) + int(be) self.frame['N_a'] = int(ae) self.frame['N_b'] = int(be) # Try to get the electronic state but don't try too hard try: states = [] #for lno, ln in found[_reelecstate]: for _, ln in found[_reelecstate]: if 'initial' in ln: continue states.append(ln.split()[4].replace('.', '')) self.frame['state'] = states except (IndexError, ValueError): pass
[docs] def parse_excitation(self): # TDDFT flags _retddft = 'TD' _reexcst = 'Excited State' chk = self.find(_retddft, keys_only=True) if not chk: return # Find the data found = self.find(_reexcst) keeps, maps, summ = [], [] ,[] for i, (lno, ln) in enumerate(found): summ.append(ln) lno += 1 while '->' in self[lno]: keeps.append(lno) maps.append(i) lno += 1 cols = [0, 1, 2, 'kind', 'eV', 3, 'nm', 4, 'osc', 's2'] summ = pd.read_csv(six.StringIO('\n'.join([ln for lno, ln in found])), delim_whitespace=True, header=None, names=cols, usecols=[c for c in cols if type(c) == str]) summ['s2'] = summ['s2'].str[7:].astype(np.float64) summ['osc'] = summ['osc'].str[2:].astype(np.float64) cols = ['occ', 0, 'virt', 'cont'] conts = pd.read_csv(six.StringIO('\n'.join([self[i] for i in keeps])), delim_whitespace=True, header=None, names=cols, usecols=[c for c in cols if isinstance(c, str)]) conts['map'] = maps for col in summ.columns: conts[col] = conts['map'].map(summ[col]) conts['energy'] = conts['eV'] * Energy['eV', 'Ha'] conts['frame'] = conts['group'] = 0 self.excitation = conts
[docs] def parse_elec_dipole(self, length=True): ''' Parse the electric dipole moment. Args: length (bool, optional): Parse the dipoles in the length formalism. Defaults to `True`. ''' df = self._get_dipole(length=length) self.elec_dipole = df
[docs] def parse_elec_quadrupole(self): ''' Parse the transition velocity quadrupole moment. ''' _requad = "Ground to excited state transition velocity quadrupole moments" found = self.find(_requad, keys_only=True) if not found: return start = found[0] + 2 stop = start while self[stop].split()[0] != r'<0|del|b>': stop += 1 df = self.pandas_dataframe(start, stop, ncol=7) df.drop(0, axis=1, inplace=True) cols = ['xx', 'yy', 'zz', 'xy', 'xz', 'yz'] df.columns = cols self.elec_quadrupole = df
[docs] def parse_mag_dipole(self): ''' Parse the magnetic dipole moment. ''' _remag = "Ground to excited state transition magnetic dipole moments" found = self.find(_remag, keys_only=True) if not found: return start = found[0] + 2 stop = start while self[stop].split()[0] != 'Ground': stop += 1 df = self.pandas_dataframe(start, stop, ncol=4) df.drop(0, axis=1, inplace=True) df.columns = ['x', 'y', 'z'] self.mag_dipole = df
[docs] def parse_spec_prop(self): ''' Parse the spectral properties. ''' _rerotstr = r"Rotatory Strengths (R) in cgs (10**-40 erg-esu-cm/Gauss)" found = self.find(_rerotstr, keys_only=True) if not found: return starts = np.array(found) + 2 stop = starts[0] while self[stop].split()[0] != r'1/2[<0|r|b>*<b|rxdel|0>': stop += 1 stops = starts + (stop - starts[0]) srs = [] for idx, (start, stop) in enumerate(zip(starts, stops)): df = self.pandas_dataframe(start, stop, ncol=6) name = 'rotstr_len' if idx == 1 else 'rotstr_vel' sr = pd.Series(df[4].values, name=name) srs.append(sr) df = self._get_dipole(True) dip_len = pd.Series(df['dipstr'].values, name='dipstr_len') df = self._get_dipole(False) dip_vel = pd.Series(df['dipstr'].values, name='dipstr_vel') srs.append(dip_len) srs.append(dip_vel) if not hasattr(self, 'excitation'): self.parse_excitation() exc = self.excitation.groupby('map').apply(lambda x: x.iloc[0]['energy']) exc = pd.Series(exc.values, name='energy_au') df = pd.concat([exc]+srs, axis=1) self.spec_prop = df
[docs] def parse_frequency(self): # check to see if HPModes is being used in the input _rehpmodes = 'HPModes' _reharm = 'Harmonic frequencies' found_hp = self.regex(_rehpmodes, ignore_case=True) # TODO: look into effect of the next huge assumption # test with an ROA calculation found_ha = self.regex(_reharm, keys_only=True) if not found_ha: return # if it is found then we get the location of the _reharm labels # this gives us the location of the different precision data types start_read = found_ha[0] if found_hp: print("Parsing frequency normal modes from HPModes output") hpmodes = True stop_read = found_ha[1] else: stop_read = None hpmodes = False # Frequency flags _refreq = 'Frequencies' # we set the start and stop condition because HPModes will print the displacement # data twice, once with high precision and once with normal precision found = self.regex(_refreq, start=start_read, stop=stop_read, keys_only=True) if not found: return # set the start point where the matches were found and add the starting point starts = np.array(found) + start_read # get the location of the Atom labels in the frequency blocks # the line below this is where all of the normal mode displacement data begins found_atom = np.array(self.regex('Atom', start=start_read, stop=stop_read, keys_only=True)) + start_read try: # Total lines per block minus the unnecessary ones span = starts[1] - found_atom[0] - 3 except IndexError: start = found_atom[0]+1 span = 0 while self[start].strip() and self[start].split()[0] != 'Harmonic': span += 1 start += 1 # get the number of other attributes included # this is done so we can have some flexibility in the code as there may be times that # the number of added attributes is not always the same and we get errors span_freq_vals = found_atom[0] - starts[0] + 1 dfs, fdx = [], 0 # set a bunch of conditional parameters if hpmodes is activated # saves us from putting if statements in the for loop freq_start = 24 if hpmodes else 15 norm_mode = 1 if hpmodes else 3 extra_col = 3 if hpmodes else 2 # Iterate over what we found for lno in starts: ln = self[lno] # Get the frequencies first freqs = ln[freq_start:].split() # get the ir intensities r_mass = list(map(float, self[lno+1][freq_start:].split())) f_cons = list(map(float, self[lno+2][freq_start:].split())) ir_int = list(map(float, self[lno+3][freq_start:].split())) nfreqs = len(freqs) # Get just the atom displacement vectors start = lno + span_freq_vals stop = start + span # we use the conditional parameters we set before to correctly splice all the data cols = range(extra_col + norm_mode * nfreqs) df = self.pandas_dataframe(start, stop, ncol=cols) # cannot get rid of this one conditional if hpmodes: # get the displacements dx, dy, dz = df.groupby(0).apply(lambda x: x[np.arange(3,nfreqs+3)].unstack().values).values # Generate the appropriate dimensions of other columns labels = np.tile(df[1].drop_duplicates().values, nfreqs) zs = np.tile(df.groupby(0).get_group(1)[2].values, nfreqs) # we use df.shape[0]/3 because the HPModes prints all of the coordinates as a single row # so it will be the number_of_atoms * 3 freqdxs = np.repeat(range(fdx, fdx + nfreqs), int(df.shape[0]/3)) freqs = np.repeat(freqs, int(df.shape[0]/3)) ir_int = np.repeat(ir_int, int(df.shape[0]/3)) r_mass = np.repeat(r_mass, int(df.shape[0]/3)) f_cons = np.repeat(f_cons, int(df.shape[0]/3)) else: # Split up the df and unstack it slices = [list(range(2 + i, 2 + 3 * nfreqs, 3)) for i in range(nfreqs)] dx, dy, dz = [df[i].unstack().values for i in slices] # Generate the appropriate dimensions of other columns labels = np.tile(df[0].values, nfreqs) zs = np.tile(df[1].values, nfreqs) freqdxs = np.repeat(range(fdx, fdx + nfreqs), df.shape[0]) freqs = np.repeat(freqs, df.shape[0]) ir_int = np.repeat(ir_int, df.shape[0]) r_mass = np.repeat(r_mass, df.shape[0]) f_cons = np.repeat(f_cons, df.shape[0]) fdx += nfreqs # Put it all together stacked = pd.DataFrame.from_dict({'Z': zs, 'label': labels, 'dx': dx, 'dy': dy, 'dz': dz, 'frequency': freqs, 'freqdx': freqdxs, 'ir_int': ir_int, 'r_mass': r_mass, 'f_const': f_cons}) stacked['symbol'] = stacked['Z'].map(z2sym) dfs.append(stacked) # Now put all our frequencies together frequency = pd.concat(dfs).reset_index(drop=True) frequency['frequency'] = frequency['frequency'].astype(np.float64) # Frame not really implemented here either frequency['frame'] = 0 # generate the frequency and extended frequency dataframes self.frequency = frequency
[docs] def parse_nmr_shielding(self): # nmr shielding regex # this might lead to wrong behavior so it may need to change _renmr = "SCF GIAO Magnetic shielding tensor (ppm):" _reiso = "Isotropic" found = self.find(_renmr) if not found: return found = self.find(_reiso) # base properties if not hasattr(self, 'atom'): self.parse_atom() atom = self.atom['set'].drop_duplicates().values nat = len(atom) symbols = self.atom.groupby('frame').get_group(0)['symbol'] df = [] for f in found: # get line value +1 for the tensors start = f[0]+1 # set end value end = start + 3 split_str = f[1].strip().split() # get the isotropic value iso = float(split_str[4]) # get the anisotropic value aniso = float(split_str[-1]) # get the tensor elements tmp = self.pandas_dataframe(start, end, ncol=6) tmp.drop([0,2,4], axis='columns', inplace=True) # create a temporary dataframe tmp = pd.DataFrame(tmp.unstack().values.reshape(1,9), columns=['xx','xy','xz','yx','yy','yz','zx','zy','zz']) tmp['isotropic'] = iso tmp['anisotropic'] = aniso # append to array of df's df.append(tmp) # create dataframe shielding = pd.concat(df, ignore_index=True) shielding['atom'] = atom shielding['symbol'] = symbols shielding['label'] = 'nmr shielding' shielding['frame'] = np.tile(0, nat) # write to class attribute self.nmr_shielding = shielding
[docs] def parse_gradient(self): # gradient regex flags _regradient = "Forces (Hartrees/Bohr)" # find data found = self.find(_regradient, keys_only=True) if not found: return # Gaussian prints the gradients in the # input orientation self.parse_atom(orientation='input') print("Parsing the atom table in the input " \ +"orientation as the gradient is printed " \ +"using this molecular orientation.") # set start point starts = np.array(found) + 3 stop = starts[0] while '-------' not in self[stop]: stop += 1 stops = starts + (stop - starts[0]) dfs = [] for i, (start, stop) in enumerate(zip(starts, stops)): grad = self.pandas_dataframe(start, stop, 5) grad['frame'] = i dfs.append(grad) grad = pd.concat(dfs, ignore_index=True) grad.columns = ['atom', 'Z', 'fx', 'fy', 'fz', 'frame'] # not sure why this should be here but it make it so it agrees with other # codes when using the va calculations grad[['fx', 'fy', 'fz']] *= -1.0 grad['atom'] -= 1 grad['symbol'] = grad['Z'].map(z2sym) self.gradient = grad
# Below are triangular matrices -- One electron integrals
[docs] def parse_overlap(self): _reovl01 = '*** Overlap ***' overlap = self._parse_triangular_matrix(_reovl01, 'coef') if overlap is not None: self.overlap = overlap
[docs] def parse_multipole(self): _reixn = 'IX= {}' mltpl = self._parse_triangular_matrix(_reixn.format(1), 'ix1') if mltpl is not None: mltpl['ix2'] = self._parse_triangular_matrix(_reixn.format(2), 'ix2', True) mltpl['ix3'] = self._parse_triangular_matrix(_reixn.format(3), 'ix3', True) self.multipole = mltpl
[docs] def parse_hessian(self): ''' Parse the Hessian from the Gaussian stdout. This is printed as a lower triangular matrix. We convert this to a square matrix internally. ''' _rehess = 'Force constants in Cartesian coordinates:' found = self.find(_rehess, keys_only=True) if not found: return # Gaussian prints the hessian in the # input orientation self.parse_atom(orientation='input') print("Parsing the atom table in the input " \ +"orientation as the Hessian is printed " \ +"using this molecular orientation.") ldx = found[0] + 1 dfs = [] srs = [] srs_old = [] try: while float(self[ldx].split()[0]): d = self[ldx].split() if 'D' not in d[1]: ncols = len(d) cols = list(map(lambda x: int(x)-1, d)) if cols[0] != 0: srs_old = srs dfs.append(pd.concat(srs_old, axis=1).T) srs = [] else: arr = np.zeros(ncols) d = [x.replace('D', 'E') for x in d] arr[range(len(d)-1)] = list(map(float, d[1:])) sr = pd.Series(arr, index=cols, name=int(d[0])-1) srs.append(sr) ldx += 1 except ValueError: dfs.append(pd.concat(srs, axis=1).T) df = pd.concat(dfs, axis=1) df.fillna(0, inplace=True) df = df + (df.T - np.eye(df.shape[0])*df) self.hessian = df
def __init__(self, *args, **kwargs): super(Output, self).__init__(*args, **kwargs)
[docs]class Fchk(six.with_metaclass(GauMeta, Editor)): # set a minimum tolerance for displayed values _tol = 1e-8 def _intme(self, fitem, idx=0): """Helper gets an integer of interest.""" return int(self[fitem[idx]].split()[-1]) def _dfme(self, fitem, dim, idx=0): """Helper gets an array of interest.""" start = fitem[idx] + 1 col = min(len(self[start].split()), dim) stop = np.ceil(start + dim / col).astype(np.int64) return self.pandas_dataframe(start, stop, col).stack().values def _frequency_ext(self): ''' Parses extended frequency data present in FChk file Note: Requires Freq(SaveNormalModes) in input ''' # Frequency regex _renmode = 'Number of Normal Modes' _refinfo = 'Vib-E2' # TODO: find out what these two values are useful for _rendim = 'Vib-NDim' _rendim0 = 'Vib-NDim0' found = self.find(_renmode) if not found: return else: found = self.find(_renmode, _refinfo, keys_only=True) # find number of vibrational modes nmode = self._intme(found[_renmode]) # get extended data (given by mapper array) all_info = self._dfme(found[_refinfo], nmode * 14) all_info[abs(all_info) < self._tol] = 0 freqdx = [i for i in range(nmode)] # mapper array to filter through all of the values printed on the fchk file # TODO: have to find labels of unknown columns mapper = ["freq", "r_mass", "f_const", "ir_int", "raman_activity", "depol_plane", "depol_unpol", "dipole_s", "rot_s", "unk4", "unk5", "unk6", "unk7", "em_angle"] # build extended frequency table df = pd.DataFrame.from_dict({mapper[int(i/nmode)]: all_info[i:i+nmode] for i in range(0, nmode*14-1, nmode)}) df['freqdx'] = freqdx return df
[docs] def parse_atom(self): # Atom regex _renat = 'Number of atoms' _reznum = 'Atomic numbers' _rezeff = 'Nuclear charges' _reposition = 'Current cartesian coordinates' # Find line numbers of interest found = self.find(_renat, _reznum, _rezeff, _reposition, stop=100, keys_only=True) # Number of atoms in current geometry nat = self._intme(found[_renat]) # Atom identifiers znums = self._dfme(found[_reznum], nat) # Atomic symbols symbols = list(map(lambda x: z2sym[x], znums)) # Z effective if ECPs are used zeffs = self._dfme(found[_rezeff], nat).astype(np.int64) # Atomic positions pos = self._dfme(found[_reposition], nat * 3).reshape(nat, 3) frame = np.zeros(len(symbols), dtype=np.int64) self.atom = pd.DataFrame.from_dict({'symbol': symbols, 'Zeff': zeffs, 'frame': frame, 'x': pos[:,0], 'y': pos[:,1], 'z': pos[:,2], 'set': range(1, len(symbols) + 1)})
[docs] def parse_basis_set(self): # Basis set regex _rebasdim = 'Number of basis functions' _recontdim = 'Number of contracted shells' _reprimdim = 'Number of primitive shells' _reshelltype = 'Shell types' _reprimpershell = 'Number of primitives per shell' _reshelltoatom = 'Shell to atom map' _reprimexp = 'Primitive exponents' _recontcoef = 'Contraction coefficients' _repcontcoef = 'P(S=P) Contraction coefficients' found = self.find(_rebasdim, _reshelltype, _reprimpershell, _reshelltoatom, _reprimexp, _recontcoef, _repcontcoef, keys_only=True) # Number of basis functions - UNUSED #nbas = self._intme(found[_rebasdim]) # Number of 'shell to atom' mappings dim1 = self._intme(found[_reshelltype]) # Number of primitive exponents dim2 = self._intme(found[_reprimexp]) # Handle cartesian vs. spherical here # only spherical for now shelltypes = self._dfme(found[_reshelltype], dim1).astype(np.int64) primpershell = self._dfme(found[_reprimpershell], dim1).astype(np.int64) shelltoatom = self._dfme(found[_reshelltoatom], dim1).astype(np.int64) primexps = self._dfme(found[_reprimexp], dim2) contcoefs = self._dfme(found[_recontcoef], dim2) if found[_repcontcoef]: pcontcoefs = self._dfme(found[_repcontcoef], dim2) # Keep track of some things ptr, prevatom, sp = 0, 0, False # Temporary storage of basis set data shldx = defaultdict(int) ddict = defaultdict(list) for atom, nprim, shelltype in zip(shelltoatom, primpershell, shelltypes): if atom != prevatom: prevatom, shldx = atom, defaultdict(int) # Collect the data for this basis set if shelltype == -1: shelltype, sp = 0, True L = np.abs(shelltype) step = ptr + nprim ddict[1].extend(contcoefs[ptr:step].tolist()) ddict[0].extend(primexps[ptr:step].tolist()) ddict['center'].extend([atom] * nprim) ddict['shell'].extend([shldx[L]] * nprim) ddict['L'].extend([L] * nprim) shldx[L] += 1 if sp: shldx[1] = shldx[0] + 1 ddict[1].extend(pcontcoefs[ptr:step].tolist()) ddict[0].extend(primexps[ptr:step].tolist()) ddict['center'].extend([atom] * nprim) ddict['shell'].extend([shldx[1]] * nprim) ddict['L'].extend([1] * nprim) shldx[1] += 1 ptr += nprim sp = False df = pd.DataFrame.from_dict(ddict) df.rename(columns={0: 'alpha', 1: 'd'}, inplace=True) sets, setmap = deduplicate_basis_sets(df) self.basis_set = sets self.meta['spherical'] = True self.atom['set'] = self.atom['set'].map(setmap)
# cnts = {key: -1 for key in range(10)} # pcen, pl, pn, shfns = 0, 0, 1, [] # for cen, n, l, seht in zip(df['center'], df['N'], df['L'], # df['center'].map(sets)): # if not pcen == cen: cnts = {key: -1 for key in range(10)} # if (pl != l) or (pn != n) or (pcen != cen): cnts[l] += 1 # shfns.append(mapr[(seht, l)][cnts[l]]) # pcen, pl, pn = cen, l, n # df['shell'] = shfns
[docs] def parse_basis_set_order(self): # Unique basis sets sets = self.basis_set.groupby('set') data = [] # Gaussian orders basis functions strangely # Will likely need an additional mapping for cartesian lamp = {0: [0], 1: [1, -1, 0], 2: [0, 1, -1, 2, -2], 3: [0, 1, -1, 2, -2, 3, -3], 4: [0, 1, -1, 2, -2, 3, -3, 4, -4], 5: [0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5]} # What was tag column for in basis set order? key = 'tag' if 'tag' in self.atom.columns else 'symbol' # Iterate over atoms for cent, bset, tag in zip(self.atom.index.values, self.atom['set'], self.atom[key]): seht = sets.get_group(bset) # Iterate over basis set pL, psh = -1, -1 for L, sh in zip(seht['L'], seht['shell']): if (pL == L) and (psh == sh): continue for ml in lamp[L]: data.append((tag, cent, L, ml, sh, 0)) pL = L psh = sh columns = ('tag', 'center', 'L', 'ml', 'shell', 'frame') self.basis_set_order = pd.DataFrame(data, columns=columns)
[docs] def parse_momatrix(self): # MOMatrix regex _rebasdim = 'Number of basis functions' _reindepdim = 'Number of independant functions' _realphaen = 'Alpha Orbital Energies' _reamomatrix = 'Alpha MO coefficients' _rebmomatrix = 'Beta MO coefficients' found = self.find(_rebasdim, _reindepdim, _reamomatrix, _rebmomatrix, keys_only=True) # Again number of basis functions nbas = self._intme(found[_rebasdim]) try: ninp = self._intme(found[_reindepdim]) except IndexError: ninp = nbas ncoef = self._intme(found[_reamomatrix]) # Alpha or closed shell MO coefficients coefs = self._dfme(found[_reamomatrix], ncoef) # Beta MO coefficients if they exist bcoefs = self._dfme(found[_rebmomatrix], ncoef) \ if found[_rebmomatrix] else None # Indexing chis = np.tile(range(nbas), ninp) orbitals = np.repeat(range(ninp), nbas) frame = np.zeros(ncoef, dtype=np.int64) self.momatrix = pd.DataFrame.from_dict({'chi': chis, 'orbital': orbitals, 'coef': coefs, 'frame': frame}) if bcoefs is not None: self.momatrix['coef1'] = bcoefs
[docs] def parse_frequency(self): ''' Parses frequency data from FChk file Note: Requires Freq(SaveNormalModes) in input ''' # Frequency regex _renmode = 'Number of Normal Modes' _redisp = 'Vib-Modes' _refinfo = 'Vib-E2' found = self.find(_renmode) if not found: return else: found = self.find(_renmode, _refinfo, _redisp, keys_only=True) ext_params = self._frequency_ext() # get atomic numbers znums = self.atom['Zeff'].values # get number of atoms nat = len(znums) # TODO: will the order of the atoms and their frequencies always be the same? # find number of vibrational modes nmode = self._intme(found[_renmode]) # get extended data (given by mapper array) all_info = self._dfme(found[_refinfo], nmode * 14) freq = all_info[:nmode] # get frequency mode displacements disp = self._dfme(found[_redisp], nat * nmode * 3) disp[abs(disp) < self._tol] = 0 # unstack column vector to displacement along each cartesian direction dx = disp[::3] dy = disp[1::3] dz = disp[2::3] # extend each property to have the same size freqdx = [i for i in range(len(freq))] label = [i for i in range(len(znums))] freq = np.repeat(freq, nat) freqdx = np.repeat(freqdx, nat) znums = np.tile(znums, nmode) label = np.tile(label, nmode) symbols = list(map(lambda x: z2sym[x], znums)) # just to have the same table as the one generated for normal output parser frame = np.zeros(len(znums)).astype(np.int) # build frequency table df = pd.DataFrame.from_dict({"Z": znums, "label": label, "dx": dx, "dy": dy, "dz": dz, "frequency": freq, "freqdx": freqdx, "symbol": symbols, "frame": frame}) cols = list(map(lambda x: 'unk' not in x and 'freq' not in x, ext_params.columns)) cols = ext_params.columns[cols] for col in cols: df[col] = np.repeat(ext_params[col].values, nat) self.frequency = df
[docs] def parse_gradient(self): # gradient regex _regrad = 'Cartesian Gradient' _reznums = 'Atomic numbers' _renat = 'Number of atoms' found = self.find(_regrad, _reznums, _renat, keys_only=True) if not found[_regrad]: return # get number of atoms nat = self._intme(found[_renat]) # get atomic numbers znums = self._dfme(found[_reznums], nat) # generate symbols symbols = list(map(lambda x: z2sym[x], znums)) # generate labels label = [i for i in range(len(znums))] ngrad = nat * 3 # get gradients(Ha/Bohr) grad = self._dfme(found[_regrad], ngrad) grad[abs(grad) < self._tol] = 0 # get x, y, z gradients fx = grad[::3] fy = grad[1::3] fz = grad[2::3] nframes = int(len(fx)/nat) # generate frame labels frame = [i for i in range(nframes)] # extend to match sizes of all arrays frame = np.repeat(frame, nat) label = np.tile(label, nframes) znums = np.tile(znums, nframes) symbols = np.tile(symbols, nframes) # create dataframe self.gradient = pd.DataFrame.from_dict({"Z": znums, "atom": label, "fx": fx, "fy": fy, "fz": fz, "symbol": symbols, "frame": frame}) self.gradient['Z'] = self.gradient['Z'].astype(np.int64)
[docs] def parse_nmr_shielding(self): # nmr shielding regex _renmr = 'NMR shielding' # base properties _renat = 'Number of atoms' _reznums = 'Atomic numbers' found = self.find(_renmr) if not found: return else: found = self.find(_renmr, _renat, _reznums, keys_only=True) # get number of atoms nat = self._intme(found[_renat]) # get atomic numbers znums = self._dfme(found[_reznums], nat) # generate labels label = [i for i in range(len(znums))] # get NMR shielding tensors (Hz) # TODO: Check that this is in fact in Hz shield = self._dfme(found[_renmr], nat * 9) shield[abs(shield) < self._tol] = 0 # arrange into x, y, z cols x = shield[::3] y = shield[1::3] z = shield[2::3] # # conversion from Hz to ppm # # (only done for some test calculations may not be accurate for all) # x *= 1e6/18778.86 # y *= 1e6/18778.86 # z *= 1e6/18778.86 matrix = np.transpose([x,y,z]).reshape(nat,3,3) matrix = np.asarray([0.5*(mat + np.transpose(mat)) for mat in matrix]) # compute isotropic and anisotropic values # done in units of Hz iso = np.zeros(len(matrix)) aniso = np.zeros(len(matrix)) # TODO: check when we get complex eigenvalues for idx, mat in enumerate(matrix): vals = np.linalg.eigvals(mat) vals.sort() iso[idx] = np.average(vals, axis=-1) aniso[idx] = vals[2] - (vals[0] + vals[1])/2 matrix = matrix.reshape(len(matrix)*3, -1) df = pd.DataFrame(matrix, columns=['x', 'y', 'z']) label = 'nmr shielding' # get atom center indexes atom = [i for i in range(nat)] # get atom symbols nframes = int(len(x)/(3*nat)) symbols = np.tile([z2sym[z] for z in znums], nframes) # get frame indexes frame = [i for i in range(nframes)] # extend arrays to match size frame = np.repeat(frame, nat) label = np.repeat(label, nat) atom = np.tile(atom, nframes) df['grp'] = [i for i in range(nframes * nat) for j in range(3)] df = pd.DataFrame(df.groupby('grp').apply(lambda x: x.unstack().values[:-3]).values.tolist(), columns=['xx','xy','xz','yx','yy','yz','zx','zy','zz']) df['atom'] = atom.astype(np.int64) df['symbol'] = symbols df['label'] = label df['frame'] = frame.astype(np.int64) df['isotropic'] = iso df['anisotropy'] = aniso # TODO: must make a conditional so it can detect if a tensor object already exists and # concat the two dataframes # will also have to consider in core.tensor.add_tensor self.nmr_shielding = df
[docs] def parse_orbital(self): # Orbital regex _reorboc = 'Number of .*electrons' _reorben = 'Orbital Energies' found = self.regex(_reorben, _reorboc, keys_only=True) ae = self._intme(found[_reorboc], idx=1) be = self._intme(found[_reorboc], idx=2) nbas = self._intme(found[_reorben]) ens = np.concatenate([self._dfme(found[_reorben], nbas, idx=i) for i, start in enumerate(found[_reorben])]) os = nbas != len(ens) self.orbital = Orbital.from_energies(ens, ae, be, os=os)
[docs] def parse_hessian(self): ''' Parse the Hessian from the Gaussian stdout. This is printed as a lower triangular matrix. We convert this to a square matrix internally. ''' _rehess = 'Cartesian Force Constants' found = self.find(_rehess, keys_only=True) if not found: return if not hasattr(self, 'atom'): self.parse_atom() nelem = self._intme(found) hess_elem = self._dfme(found, nelem) nat = self.atom.shape[0] arr = np.zeros((3*nat, 3*nat)) tril_idx = np.tril_indices_from(arr, k=0) for idx, (i, j) in enumerate(zip(*tril_idx)): arr[i][j] = hess_elem[idx] arr = arr + (arr.T - np.eye(3*nat)*arr) df = pd.DataFrame(arr) self.hessian = df
def __init__(self, *args, **kwargs): super(Fchk, self).__init__(*args, **kwargs)
# def _dedup(sets, sp=False): # unique, setmap, cnt = [], {}, 0 # sets = sets.groupby('center') # chk = [0, 1] # for center, seht in sets: # for i, other in enumerate(unique): # if other.shape != seht.shape: continue # if np.allclose(other[chk], seht[chk]): # setmap[center] = i # break # else: # unique.append(seht) # setmap[center] = cnt # cnt += 1 # if sp: unique = _expand_sp(unique) # sets = pd.concat(unique).reset_index(drop=True) # try: sets.drop([2, 3], axis=1, inplace=True) # except ValueError: pass # sets.rename(columns={'center': 'set', 0: 'alpha', 1: 'd'}, inplace=True) # sets['set'] = sets['set'].map(setmap) # sets['frame'] = 0 # return sets, setmap # # # def _expand_sp(unique): # expand = [] # for seht in unique: # if np.isnan(seht[2]).sum() == seht.shape[0]: # expand.append(seht) # continue # sps = seht[2][~np.isnan(seht[2])].index # shls = len(seht.ix[sps]['shell'].unique()) # dupl = seht.ix[sps[0]:sps[-1]].copy() # dupl[1] = dupl[2] # dupl['L'] = 1 # dupl['shell'] += shls # last = seht.ix[sps[-1] + 1:].copy() # last['shell'] += shls # expand.append(pd.concat([seht.ix[:sps[0] - 1], # seht.ix[sps[0]:sps[-1]], # dupl, last])) # return expand def _basis_set_order(chunk, mapr, sets): # Gaussian only prints the atom center # and label once for all basis functions first = len(chunk[0]) - len(chunk[0].lstrip(' ')) + 1 df = pd.read_fwf(six.StringIO('\n'.join(chunk)), widths=[first, 4, 3, 2, 4], header=None) df[1].fillna(method='ffill', inplace=True) df[1] = df[1].astype(np.int64) - 1 df[2].fillna(method='ffill', inplace=True) df.rename(columns={1: 'center', 3: 'N', 4: 'ang'}, inplace=True) df['N'] = df['N'].astype(np.int64) - 1 if 'XX' in df['ang'].values: df[['L', 'l', 'm', 'n']] = df['ang'].map({'S': [0, 0, 0, 0], 'XX': [2, 2, 0, 0], 'XY': [2, 1, 1, 0], 'XZ': [2, 1, 0, 1], 'YY': [2, 0, 2, 0], 'YZ': [2, 0, 1, 1], 'ZZ': [2, 0, 0, 2], 'PX': [1, 1, 0, 0], 'PY': [1, 0, 1, 0], 'PZ': [1, 0, 0, 1], }).apply(tuple).apply(pd.Series) else: df['L'] = df['ang'].str[:1].str.lower().map(lmap).astype(np.int64) df['ml'] = df['ang'].str[1:] df['ml'].update(df['ml'].map({'': 0, 'X': 1, 'Y': -1, 'Z': 0})) df['ml'] = df['ml'].astype(np.int64) cnts = {key: -1 for key in range(10)} pcen, pl, pn, shfns = 0, 0, 1, [] for cen, n, l, seht in zip(df['center'], df['N'], df['L'], df['center'].map(sets)): if not pcen == cen: cnts = {key: -1 for key in range(10)} if (pl != l) or (pn != n) or (pcen != cen): cnts[l] += 1 shfns.append(mapr[(seht, l)][cnts[l]]) pcen, pl, pn = cen, l, n df['shell'] = shfns df.drop([0, 2, 'N', 'ang'], axis=1, inplace=True) df['frame'] = 0 return df