# -*- coding: utf-8 -*-
# Copyright (c) 2015-2022, Exa Analytics Development Team
# Distributed under the terms of the Apache License 2.0
"""
Molcas Output Parser
#####################
Multiple frames are not currently supported
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import os
import re
import six
import pandas as pd
import numpy as np
from six import StringIO
from exatomic.exa import TypedMeta
import exatomic
from .editor import Editor
from exatomic.core import Atom, Gradient, Frequency
from exatomic.algorithms.numerical import _flat_square_to_triangle, _square_indices
from exatomic.algorithms.basis import lmap, spher_lml_count
from exatomic.core.basis import (Overlap, BasisSet, BasisSetOrder,
                                 deduplicate_basis_sets)
from exatomic.core.orbital import DensityMatrix, MOMatrix, Orbital
from exatomic.base import sym2z, z2sym


class OrbMeta(TypedMeta):
    momatrix = MOMatrix
    orbital = Orbital


class Orb(six.with_metaclass(OrbMeta, Editor)):
    """
    Parser for molcas coefficient matrix dumps (e.g. RasOrb).

    Note:
        This parser assumes the file contains data from a single
        calculation (i.e. a single frame).
    """

    def to_universe(self):
        raise NotImplementedError("This editor has no parse_atom method.")

    def _read_one(self, found, keys, start, stop, osh, old, ret, key):
        if not old:
            # In order for this not to break if someone decides
            # to change the print width format, compute it here
            ln = self[start]
            width = ln[3:].index(' ') + 3
            keys['widths'] = [width] * len(ln.split())
            keys['names'] = range(len(keys['widths']))
        df = pd.read_fwf(StringIO('\n'.join(self[start:stop])), **keys)
        if not osh:
            ret.update({key: df.stack().dropna().values})
        else:
            # Clean for open shell calcs
            rm = found[1] - found[0] - 1
            df.drop([rm - 1, rm], inplace=True)
            df[0] = df[0].astype(np.float64)
            df = df.stack().dropna().astype(np.float64).values
            ret.update({key: df[:len(df)//2],
                        key + '1': df[len(df)//2:]})

    def parse_momatrix(self):
        _re_orb = 'ORBITAL'
        _re_occ = 'OCCUPATION NUMBERS'
        _re_ens = 'ONE ELECTRON ENERGIES'
        _re_osh = '#UORB'
        _re_idx = '#INDEX'
        _re_hmn = 'human'
        found = self.find(_re_orb, _re_occ, _re_ens, _re_osh, _re_idx,
                          keys_only=True)
        found[_re_hmn] = [i for i in found[_re_occ]
                          if _re_hmn in self[i].lower()]
        # Dimensions per irrep
        dims = list(map(int, self[5].split()))
        ndim = sum(dims)
        osh = len(found[_re_osh]) > 0
        start = found[_re_orb][0] + 1
        stops = found[_re_occ] + found[_re_ens]
        stop = min(stops) - 1
        # Old file format
        old = len(self[start].split()) == 1
        # MOMatrix table
        widths = [18] * 4 if old else [22] * 5
        kws = {'widths': widths, 'names': range(len(widths))}
        df = pd.read_fwf(StringIO('\n'.join(self[start:stop])),
                         widths=widths, names=range(len(widths)))
        df.drop(df[df[0].str.contains(_re_orb)].index, inplace=True)
        # Orbital occupations
        mo, orb = {}, {}
        start = found[_re_occ][0] + 1
        stops = found[_re_hmn] + found[_re_ens] + found[_re_idx] + [len(self) + 1]
        stop = min(stops) - 1
        self._read_one(found[_re_occ], kws, start, stop, osh, old,
                       orb, 'occupation')
        # Orbital energies
        if found[_re_ens]:
            start = found[_re_ens][0] + 1
            stop = found[_re_idx][0]
            self._read_one(found[_re_ens], kws, start, stop, osh, old,
                           orb, 'energy')
        else:
            orb.update({'energy': 0})
        # Get all the groupby indices
        if len(dims) > 1:
            # Symmetrized calc.
            mo.update({
                'irrep': [i for i, d in enumerate(dims) for _ in range(d * d)],
                'orbital': [i for d in dims for i in range(d) for _ in range(d)],
                'chi': [i for d in dims for _ in range(d) for i in range(d)]})
            orb.update({'vector': [j for d in dims for j in range(d)],
                        'irrep': [i for i, d in enumerate(dims) for _ in range(d)]})
        else:
            ordx, chidx = _square_indices(ndim)
            mo.update({'orbital': ordx, 'chi': chidx})
            orb.update({'vector': range(ndim)})
        # Unused groupby indices
        orb.update({'group': 0, 'spin': 0, 'frame': 0})
        mo.update({'frame': 0})
        if not osh:
            df[0] = df[0].astype(np.float64)
            mo.update({'coef': df.stack().dropna().values})
        else:
            # Open shell calc.
            off = found[_re_orb][0] + 1
            df.drop(found[_re_osh][0] - off, inplace=True)
            df[0] = df[0].astype(np.float64)
            coef = df.stack().dropna().values
            mo.update({'coef': coef[:len(coef)//2],
                       'coef1': coef[len(coef)//2:]})
        self.momatrix = pd.DataFrame.from_dict(mo)
        self.orbital = pd.DataFrame.from_dict(orb)

    def __init__(self, *args, **kwargs):
        super(Orb, self).__init__(*args, **kwargs)


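# Illustrative sketch (not part of the exatomic API): how one might load a
# Molcas orbital dump with the Orb parser above. The file name and the helper
# name are hypothetical; a single-calculation RasOrb/ScfOrb-style dump is
# assumed.
def _example_read_rasorb(path='mol.RasOrb'):
    """Parse an orbital dump and return its coefficient and orbital tables."""
    orb = Orb(path)
    orb.parse_momatrix()       # populates orb.momatrix and orb.orbital
    return orb.momatrix, orb.orbital

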
class OutMeta(TypedMeta):
    atom = Atom
    gradient = Gradient
    basis_set = BasisSet
    basis_set_order = BasisSetOrder
    sf_dipole_moment = pd.DataFrame
    sf_quadrupole_moment = pd.DataFrame
    sf_angmom = pd.DataFrame
    sf_energy = pd.DataFrame
    so_energy = pd.DataFrame
    sf_oscillator = pd.DataFrame
    so_oscillator = pd.DataFrame
    frequency = Frequency
    natural_occ = pd.DataFrame
    caspt2_energy = pd.DataFrame


class Output(six.with_metaclass(OutMeta, Editor)):
    _resta = "STATE"

    def _property_parsing(self, props, data_length):
        '''
        Helper method for parsing the spin-free properties sections.
        '''
        all_dfs = []
        for idx, prop in enumerate(props):
            # find where the data blocks are printed
            starts = np.array(self.find(self._resta, start=prop,
                                        keys_only=True)) + prop + 2
            # data_length should always be the same
            stops = starts + data_length
            # use np.ceil as if we have a 7x7 matrix we get one data set
            # with 4 columns and one with 3 columns
            n = int(np.ceil(data_length/4))
            dfs = []
            # grab all of the data
            # we use all hits up to n as there are many places with the
            # 'STATE' keyword so we want to just grab the data starting
            # at the property keyword
            for ndx, (start, stop) in enumerate(zip(starts[:n], stops[:n])):
                # should be four in all but the last data set
                ncols = len(self[start-2].split())
                df = self.pandas_dataframe(start, stop, ncol=ncols)
                # the only column we do not need will always be the
                # column with name 0
                df.drop(0, axis=1, inplace=True)
                # set the columns as they should be
                df.columns = list(range(ndx*4, ndx*4+ncols-1))
                dfs.append(df)
            # put the component together
            all_dfs.append(pd.concat(dfs, axis=1))
            all_dfs[-1]['component'] = idx
        df = pd.concat(all_dfs, ignore_index=True)
        return df

    def _oscillator_parsing(self, start_idx):
        '''
        Helper method to parse the oscillators.
        '''
        ldx = start_idx + 6
        oscillators = []
        while '-----' not in self[ldx]:
            oscillators.append(self[ldx].split())
            ldx += 1
        df = pd.DataFrame(oscillators)
        df.columns = ['nrow', 'ncol', 'oscil', 'a_x', 'a_y', 'a_z', 'a_tot']
        df[['nrow', 'ncol']] = df[['nrow', 'ncol']].astype(int)
        df[['nrow', 'ncol']] -= [1, 1]
        df[['nrow', 'ncol']] = df[['nrow', 'ncol']].astype('category')
        cols = ['oscil', 'a_x', 'a_y', 'a_z', 'a_tot']
        df[cols] = df[cols].astype(np.float64)
        return df

    def add_orb(self, path, mocoefs='coef', orbocc='occupation'):
        """
        Add a MOMatrix and Orbital table to a molcas.Output.

        If path is an Editor containing momatrix and orbital tables
        then adds them directly, otherwise assumes it is a molcas.Orb file.

        Args:
            path (str, :class:`exatomic.core.editor.Editor`): path to file or Editor object
            mocoefs (str): rename coefficients
            orbocc (str): rename occupations
        """
        if isinstance(path, exatomic.Editor):
            orb = path
        else:
            orb = Orb(path)
        if mocoefs != 'coef' and orbocc == 'occupation':
            orbocc = mocoefs
        # MOMatrix
        curmo = getattr(self, 'momatrix', None)
        if curmo is None:
            self.momatrix = orb.momatrix
            if mocoefs != 'coef':
                self.momatrix.rename(columns={'coef': mocoefs}, inplace=True)
        else:
            if mocoefs in self.momatrix.columns:
                raise ValueError('This action would overwrite '
                                 'coefficients. Specify mocoefs parameter.')
            for i, default in enumerate(['coef', 'coef1']):
                final = mocoefs + '1' if i else mocoefs
                if default in orb.momatrix:
                    self.momatrix[final] = orb.momatrix[default]
        # Orbital
        curorb = getattr(self, 'orbital', None)
        if curorb is None:
            self.orbital = orb.orbital
            if orbocc != 'occupation':
                self.orbital.rename(columns={'occupation': orbocc},
                                    inplace=True)
        else:
            if orbocc in self.orbital.columns:
                raise ValueError('This action would overwrite '
                                 'occupations. Specify orbocc parameter.')
            for i, default in enumerate(['occupation', 'occupation1']):
                final = orbocc + '1' if i else orbocc
                if default in orb.orbital.columns:
                    self.orbital[final] = orb.orbital[default]

    def add_overlap(self, path):
        try:
            # If it's an ASCII text file
            self.overlap = Overlap.from_column(path)
        except Exception:
            # If it's an HDF5 file
            hdf = HDF(path)
            if 'DESYM_CENTER_CHARGES' not in hdf._hdf:
                self.overlap = hdf.overlap
                return
            if 'irrep' not in self.momatrix:
                raise Exception("Trying to set symmetrized overlap with "
                                "desymmetrized MOMatrix data.")
            ovl = pd.DataFrame(np.array(hdf._hdf['AO_OVERLAP_MATRIX']),
                               columns=('coef',))
            ovl['irrep'] = self.momatrix['irrep']
            ovl['chi0'] = self.momatrix['chi']
            ovl['chi1'] = self.momatrix['orbital']
            ovl['frame'] = 0
            self.overlap = ovl

    def _check_atom_sym(self):
        """Parses a less accurate atom list to check for symmetry."""
        _re_sym = 'Cartesian Coordinates / Bohr, Angstrom'
        start = self.find(_re_sym, keys_only=True)[0] + 4
        cols = ['center', 'tag', 'x', 'y', 'z', 'd1', 'd2', 'd3']
        stop = start
        while len(self[stop].split()) == len(cols):
            stop += 1
        atom = self.pandas_dataframe(start, stop, cols)
        atom.drop(['d1', 'd2', 'd3'], axis=1, inplace=True)
        atom['symbol'] = atom['tag'].str.extract(
            '([A-z]{1,})([0-9]*)', expand=False)[0].str.lower().str.title()
        atom['frame'] = 0
        atom['center'] -= 1
        return atom

    def parse_atom(self, seward=True):
        """Parses the atom list generated in SEWARD."""
        if seward:
            _re_atom0 = 'Label Cartesian Coordinates'
            _re_atom1 = 'Center Label'
            found = self.find(_re_atom0, _re_atom1, keys_only=True)
            if found[_re_atom0]:
                accurate = True
                starts = [i + 2 for i in found[_re_atom0]]
            else:
                accurate = False
                starts = [i + 1 for i in found[_re_atom1]]
            stops = starts[:]    # Copy the list
            for i in range(len(stops)):
                while len(self[stops[i]].strip().split()) > 3:
                    stops[i] += 1
                    if not self[stops[i]].strip():
                        break
                stops[i] -= 1
            if accurate:
                lns = StringIO('\n'.join([self._lines[i] for j in
                                          (range(i, j + 1) for i, j
                                           in zip(starts, stops))
                                          for i in j]))
                cols = ['tag', 'x', 'y', 'z']
            else:
                lns = StringIO('\n'.join(self[starts[0]:stops[0] + 1]))
                cols = ['center', 'tag', 'x', 'y', 'z', 'xa', 'ya', 'za']
            atom = pd.read_csv(lns, delim_whitespace=True, names=cols)
            if len(cols) == 8:
                atom.drop(['xa', 'ya', 'za'], axis=1, inplace=True)
            atom['symbol'] = atom['tag'].str.extract(
                '([A-z]{1,})([0-9]*)', expand=False)[0].str.lower().str.title()
            atom['Z'] = atom['symbol'].map(sym2z).astype(np.int64)
            atom['center'] = range(atom.shape[0])
            atom['frame'] = 0
            self.atom = atom
            # Work-around for symmetrized calcs?
            allatom = self._check_atom_sym()
            self.meta['symmetrized'] = False
            if allatom.shape[0] > self.atom.shape[0]:
                self.atom = allatom
                self.meta['symmetrized'] = True
                try:
                    self.atom['utag'] = _add_unique_tags(self.atom)
                except ValueError:
                    pass
        else:
            _reatom = "Nuclear coordinates for the next iteration / Bohr"
            _reatom02 = "Nuclear coordinates of the final structure / Bohr"
            found = self.find(_reatom, _reatom02, keys_only=True)
            if len(found[_reatom02]) > 1:
                raise ValueError("Found more than one final atom table.")
            elif len(found[_reatom02]) == 0:
                raise ValueError("Could not find a final atom table in the file.")
            starts = np.array(found[_reatom])
            starts = np.array(list(starts) + found[_reatom02]) + 3
            stop = starts[0]
            while self[stop].strip():
                stop += 1
            stops = starts + (stop - starts[0])
            cols = ['symbol', 'x', 'y', 'z']
            dfs = []
            for idx, (start, stop) in enumerate(zip(starts, stops)):
                df = self.pandas_dataframe(start, stop, ncol=4)
                df.columns = cols
                label = np.concatenate(list(map(lambda x: re.findall(r'\D+', x),
                                                df['symbol'])))
                if label.shape[0] > df.shape[0]:
                    text = "There was an issue with interpreting the labels " \
                           + "of the atom table. Expected {} labels but " \
                           + "interpreted {}"
                    raise ValueError(text.format(df.shape[0], label.shape[0]))
                df['symbol'] = label
                df['Z'] = df['symbol'].map(sym2z)
                df['set'] = range(df.shape[0])
                df['frame'] = idx
                dfs.append(df)
            atom = pd.concat(dfs, ignore_index=True)
            self.atom = atom

    def parse_gradient(self):
        _regrad = "Molecular gradients"
        _reirr = "Irreducible representation:"
        found = self.find(_regrad, _reirr, keys_only=True)
        if not found[_regrad]:
            return
        if len(found[_regrad]) != len(found[_reirr]):
            text = "Do not have support for multiple irreducible " \
                   + "representations yet."
            raise NotImplementedError(text)
        starts = np.array(found[_regrad]) + 8
        stop = starts[0]
        while self[stop].strip()[:2] != '--':
            stop += 1
        stops = starts + (stop - starts[0])
        dfs = []
        for idx, (start, stop) in enumerate(zip(starts, stops)):
            df = self.pandas_dataframe(start, stop, ncol=4)
            cols = ['symbol', 'fx', 'fy', 'fz']
            df.columns = cols
            label = np.concatenate(list(map(lambda x: re.findall(r'\D+', x),
                                            df['symbol'])))
            df['symbol'] = label
            df['atom'] = range(df.shape[0])
            df['frame'] = idx
            df['Z'] = df['symbol'].map(sym2z)
            dfs.append(df)
        grad = pd.concat(dfs, ignore_index=True)
        self.gradient = grad

    def parse_frequency(self, linear=False, normalize=True):
        _refreq = "Frequency:"
        _reint = "Intensity:"
        _remass = "Red. mass:"
        _rerot = "Note that rotational and translational degrees " \
                 + "have been automatically removed,"
        found = self.find(_refreq, _reint, _rerot, _remass)
        if not found[_refreq]:
            return
        start = found[_remass][-1][0] + 2
        stop = found[_remass][-1][0] + 2
        while self[stop].strip():
            stop += 1
        lines = stop - start
        dfs = []
        count = 0
        arr = zip(found[_refreq], found[_reint], found[_remass])
        for idx, ((_, freq), (_, inten), (ldx, mass)) in enumerate(arr):
            # parse the normal modes
            start = ldx + 2
            stop = start + lines
            arr = self.pandas_dataframe(start, stop, ncol=8)
            arr.dropna(how='all', axis=1, inplace=True)
            # extract the atomic symbols
            symbols = arr[0].drop_duplicates()
            symbols = list(map(lambda x: re.findall(r'\D+', x)[0], symbols))
            nat = len(symbols)
            if linear and idx == 0 and found[_rerot]:
                arr.drop([2, 3, 4, 5, 6], axis=1, inplace=True)
            elif not found[_rerot] and idx == 0 and not linear:
                print("Found rotational modes in the calculation")
                continue
            # set the columns where the normal modes are on the table
            cols = range(2, arr.columns.max()+1)
            arr.drop([0], axis=1, inplace=True)
            # re-organize the normal mode data
            tmp = arr.groupby(1).apply(lambda x: x[cols].values.T.flatten())
            df = pd.DataFrame(tmp.to_dict())
            df.columns = ['dx', 'dy', 'dz']
            errortext = "Something went wrong when trying to turn the {} " \
                        + "into floating point numbers."
            # get the frequencies
            # skip the first element as it is the table label
            freqs = freq.split()[1:]
            # replace all imaginary signs with '-'
            freqs = list(map(lambda x: x.replace('i', '-'), freqs))
            # turn them into floats
            try:
                freqs = list(map(float, freqs))
            except ValueError:
                print(freqs)
                raise ValueError(errortext.format('frequencies'))
            # get the reduced masses
            # skip the first element as it is the table label
            rmass = mass.split()[2:]
            # turn them into floats
            try:
                rmass = list(map(float, rmass))
            except ValueError:
                print(rmass)
                raise ValueError(errortext.format('reduced masses'))
            # get the IR intensities
            # skip the first element as it is the table label
            irint = inten.split()[1:]
            # turn them into floats
            try:
                irint = list(map(float, irint))
            except ValueError:
                print(irint)
                raise ValueError(errortext.format('IR intensities'))
            # put everything together
            df['symbol'] = np.tile(symbols, len(cols))
            df['label'] = np.tile(range(nat), len(cols))
            df['Z'] = df['symbol'].map(sym2z)
            df['frequency'] = np.repeat(freqs, nat)
            df['r_mass'] = np.repeat(rmass, nat)
            df['ir_int'] = np.repeat(irint, nat)
            df.loc[df['ir_int'].abs() < 1e-9, 'ir_int'] = 0
            df['freqdx'] = np.repeat(range(count*6, (count+1)*len(cols)), nat)
            count += 1
            df['frame'] = 0
            if normalize and found[_rerot]:
                df[['dx', 'dy', 'dz']] *= np.sqrt(df['r_mass'].values.reshape(-1, 1))
            elif normalize and not found[_rerot]:
                pass
            dfs.append(df)
        df = pd.concat(dfs, ignore_index=True)
        cols = ['Z', 'label', 'dx', 'dy', 'dz', 'frequency', 'freqdx',
                'ir_int', 'r_mass', 'symbol', 'frame']
        self.frequency = df[cols]

    def parse_basis_set_order(self):
        """
        Parses the shell ordering scheme if BSSHOW is specified in SEWARD.
        """
        _re_bas_order = 'Basis Label Type Center'
        starts = [i + 1 for i in self.find(_re_bas_order, keys_only=True)]
        lines, irreps, vecs, vec, nsym = [], [], [], 0, 0
        for i, start in enumerate(starts):
            stop = start
            while self[stop].strip():
                lines.append(stop)
                irreps.append(i)
                vecs.append(vec)
                nsym = max(nsym, len(self[stop].split()))
                stop += 1
                vec += 1
            lines.append(stop)
            vec = 0
        symcols = [('ocen{}'.format(i), 'sign{}'.format(i))
                   for i in range((nsym - 5) // 2)]
        if symcols:
            self.meta['symmetrized'] = True
        cols = ['idx', 'tag', 'type', 'center', 'phase'] + \
               [c for t in symcols for c in t]
        df = pd.read_csv(StringIO('\n'.join(self[i] for i in lines)),
                         delim_whitespace=True, names=cols)
        # Symmetrized basis functions
        for col in df.columns:
            if col.startswith('ocen'):
                df[col] = df[col].fillna(0.).astype(np.int64) - 1
        # Extra info for symmetrized calcs
        df['irrep'] = irreps
        df['vector'] = vecs
        df.drop(['idx'], inplace=True, axis=1)
        df['frame'] = 0
        # 0-based indexing
        df['center'] -= 1
        # Clean up (L, ml) values to ints
        df['L'] = df['type'].str[1].map(lmap)
        df['ml'] = df['type'].str[2:]
        df['ml'].update(df['ml'].map({'': 0, 'x': 1, 'y': -1, 'z': 0}))
        try:
            df['ml'].update(df['ml'].str[::-1])
        except Exception:
            pass
        df['ml'] = df['ml'].astype(np.int64)
        # Seward and gateway may each print basis info.
        # See if this happened and if so, keep only the first half.
        try:
            # Apparently BSSHOW doesn't always print the basis set.
            if 'set' not in self.atom.columns:
                self.parse_basis_set()
        except ValueError:
            self.basis_set_order = df
            return
        bs = self.basis_set
        sp = self.meta['spherical']
        nbf = self.atom.set.map(bs.functions(sp).groupby('set').sum()).sum()
        if df.shape[0] > nbf:
            df = df.loc[:nbf - 1]
            irreps = irreps[:nbf]
            vecs = vecs[:nbf]
        self.basis_set_order = df
        shls = []
        grps = df.groupby(['irrep', 'center', 'L', 'ml'])
        for (_, _, L, ml), grp in grps:    # keys: (irrep, center, L, ml)
            shl = 0
            for _ in grp.index:
                shls.append(shl)
                shl += 1
        self.basis_set_order['shell'] = shls

    def parse_basis_set(self):
        """Parses the primitive exponents, coefficients, and shells
        if BSSHOW is specified in SEWARD."""
        _re_bas_0 = 'Shell nPrim nBasis Cartesian Spherical Contaminant'
        _re_bas_1 = 'Label Cartesian Coordinates / Bohr'
        _re_bas_2 = 'No. Exponent Contraction Coefficients'
        found = self.find(_re_bas_0, _re_bas_1, _re_bas_2, keys_only=True)
        bmaps = [i + 1 for i in found[_re_bas_0]]
        atoms = [i + 2 for i in found[_re_bas_1]]
        alphs = [i + 1 for i in found[_re_bas_2]]
        widths = [11, 7, 8, 11, 10, 12]
        names = _re_bas_0.split()
        setmap, basmap = {}, []
        for seht, (start, atst) in enumerate(zip(bmaps, atoms)):
            stop = start
            while self[stop].strip():
                stop += 1
            while self[atst].strip():
                setmap[self[atst].split()[0]] = seht
                atst += 1
            basmap.append(pd.read_fwf(StringIO('\n'.join(self[start:stop])),
                                      widths=widths, header=None, names=names))
            basmap[-1]['set'] = seht
        self.atom['set'] = self.atom['tag'].map(setmap)
        basmap = pd.concat(basmap).reset_index(drop=True)
        basmap['Shell'] = basmap['Shell'].map(lmap)
        prims, pset, shell = [], 0, 0
        for start, seht, L, nprim, nbas in zip(alphs, basmap['set'],
                                               basmap['Shell'],
                                               basmap['nPrim'],
                                               basmap['nBasis']):
            if pset != seht:
                shell = 0
            # In case contraction coefficients overflow to next line
            nmatch = len(self[start].split())
            neat = nmatch == len(self[start + 1].split())
            if neat:
                block = self.pandas_dataframe(start, start + nprim, nbas + 2)
            else:
                # Extra obfuscation to handle exotic cases
                ext = 1
                while nmatch != len(self[start + ext].split()):
                    ext += 1
                stop = start + ext * nprim
                collated = [''.join(self[start + i * ext:start + i * ext + ext])
                            for i in range(nprim)]
                ncols = len(collated[0].split())
                block = pd.read_csv(StringIO('\n'.join(collated)),
                                    delim_whitespace=True, names=range(ncols))
            alps = (pd.concat([block[1]] * nbas).reset_index(drop=True)
                    .str.replace('D', 'E').astype(np.float64))
            ds = block[list(range(2, nbas + 2))].unstack().reset_index(drop=True)
            pdf = pd.concat([alps, ds], axis=1)
            pdf.columns = ['alpha', 'd']
            pdf['L'] = L
            pdf['shell'] = np.repeat(range(shell, shell + nbas), nprim)
            pdf['set'] = seht
            prims.append(pdf)
            shell += nbas
            pset = seht
        prims = pd.concat(prims).reset_index(drop=True)
        prims['frame'] = 0
        self.basis_set = prims
        self.meta['spherical'] = True
        if self.basis_set.lmax < 2:
            self.meta['spherical'] = False

    def parse_sf_dipole_moment(self):
        '''
        Get the Spin-Free electric dipole moment from RASSI
        '''
        # define the search string
        _retdm = "PROPERTY: MLTPL 1"
        component_map = {0: 'x', 1: 'y', 2: 'z'}
        found = self.find(_retdm, keys_only=True)
        if not found:
            return
        if len(found) > 6:
            # newer versions of molcas print extra matrices,
            # so this was added due to that change
            props = np.array(found)[:6:2]
        else:
            props = np.array(found)[:3]
        # find the number of lines that the data spans
        stop = props[0] + 5
        while self[stop].strip():
            stop += 1
        data_length = stop - props[0] - 5
        # get the data
        stdm = self._property_parsing(props, data_length)
        stdm['component'] = stdm['component'].map(component_map)
        self.sf_dipole_moment = stdm

    def parse_sf_quadrupole_moment(self):
        '''
        Get the Spin-Free electric quadrupole moment from RASSI
        '''
        _requad = "PROPERTY: MLTPL 2"
        component_map = {0: 'xx', 1: 'xy', 2: 'xz', 3: 'yy', 4: 'yz', 5: 'zz'}
        found = self.find(_requad, keys_only=True)
        if not found:
            return
        # molcas prints the upper triangular tensor
        # elements of the full quadrupole tensor
        props = np.array(found)[:6]
        # find the number of lines that the data spans
        stop = props[0] + 5
        while self[stop].strip():
            stop += 1
        data_length = stop - props[0] - 5
        # get the data
        sqdm = self._property_parsing(props, data_length)
        sqdm['component'] = sqdm['component'].map(component_map)
        self.sf_quadrupole_moment = sqdm

    def parse_sf_angmom(self):
        '''
        Get the Spin-Free angular momentum from RASSI
        '''
        _reangm = "PROPERTY: ANGMOM"
        component_map = {0: 'x', 1: 'y', 2: 'z'}
        found = self.find(_reangm, keys_only=True)
        if not found:
            return
        props = np.array(found)[:3]
        # find the number of lines that the data spans
        stop = props[0] + 5
        while self[stop].strip():
            stop += 1
        data_length = stop - props[0] - 5
        # get the data
        sangm = self._property_parsing(props, data_length)
        sangm['component'] = sangm['component'].map(component_map)
        self.sf_angmom = sangm

    def parse_sf_energy(self):
        '''
        Get the Spin-Free energies from RASSI or RASSCF
        '''
        _reenerg = " RASSI State "
        _reenerg_rasscf = " RASSCF root number"
        found = self.find(_reenerg, _reenerg_rasscf)
        key = ''
        if found[_reenerg]:
            if found[_reenerg_rasscf]:
                msg = "Found RASSCF and RASSI Spin-Free energies.\n" \
                      + "Will only parse the RASSI Spin-Free energies."
                print(msg)
            key = _reenerg
        elif found[_reenerg_rasscf]:
            key = _reenerg_rasscf
        else:
            return
        # should not be necessary but you never know
        if key == '':
            msg = "There was an issue in determining the key to " \
                  + "be used in the parsing of the SF energies. " \
                  + "Please open an issue ticket as this should " \
                  + "never show."
            raise ValueError(msg)
        energies = []
        for _, line in found[key]:
            energy = float(line.split()[-1])
            energies.append(energy)
        # should always be the case that the first energy is the
        # ground state minimum energy
        rel_energy = list(map(lambda x: x - energies[0], energies))
        df = pd.DataFrame.from_dict({'energy': energies,
                                     'rel_energy': rel_energy})
        self.sf_energy = df

    def parse_so_energy(self):
        '''
        Get the Spin-Orbit energies from RASSI
        '''
        _reenerg = " SO-RASSI State "
        found = self.find(_reenerg)
        if not found:
            return
        energies = []
        for _, line in found:
            energy = float(line.split()[-1])
            energies.append(energy)
        rel_energy = list(map(lambda x: x - energies[0], energies))
        df = pd.DataFrame.from_dict({'energy': energies,
                                     'rel_energy': rel_energy})
        self.so_energy = df

    def parse_sf_oscillator(self):
        '''
        Get the printed Spin-Free oscillators from RASSI
        '''
        # TODO: check how this has to be adjusted for different
        #       molcas print levels
        _reosc = "++ Dipole transition strengths (spin-free states):"
        found = self.find(_reosc, keys_only=True)
        if not found:
            return
        if len(found) > 1:
            raise NotImplementedError("We have found more than one key for the "
                                      + "spin-free oscillators.")
        df = self._oscillator_parsing(found[0])
        self.sf_oscillator = df

    def parse_so_oscillator(self):
        '''
        Get the printed Spin-Orbit oscillators from RASSI
        '''
        # TODO: check how this has to be adjusted for different
        #       molcas print levels
        _reosc = "++ Dipole transition strengths (SO states):"
        found = self.find(_reosc, keys_only=True)
        if not found:
            return
        if len(found) > 1:
            raise NotImplementedError("We have found more than one key for the "
                                      + "spin-orbit oscillators.")
        df = self._oscillator_parsing(found[0])
        self.so_oscillator = df

    def parse_natural_occ(self):
        # define search string for natural occupations
        _renoo = "Natural orbitals and occupation numbers for root"
        # define search string for energies per root
        _reeng = "energy="
        found = self.find(_renoo, _reeng)
        if not found[_renoo]:
            return
        # get the occupations, roots, and energies of each root
        occs = {}
        roots = []
        energies = []
        for (nooldx, nooline), (_, engline) in zip(found[_renoo], found[_reeng]):
            energy = float(engline.split('=')[-1].strip())
            energies.append(energy)
            root = list(map(int, re.findall(r'\d+', nooline)))
            # just an error check; should never actually happen
            if len(root) != 1:
                raise ValueError("Found more than one group of numbers where one "
                                 + "number should have been listed for the root. "
                                 + "Found line {}".format(nooline))
            roots.append(root[0])
            # get the natural occupations
            dldx = nooldx + 1
            vals = {}
            while self[dldx].strip() != '':
                d = self[dldx].split()
                if d[0] == 'sym':
                    arr = list(map(float, d[2:]))
                    symm = int(d[1].replace(':', ''))
                    if root[0] == 1:
                        occs[symm] = []
                    vals[symm] = []
                else:
                    arr = list(map(float, d))
                vals[symm].append(arr)
                dldx += 1
            for key, item in vals.items():
                occs[key].append(list(np.concatenate(item)))
        dfs = []
        for symm, occ in occs.items():
            df = pd.DataFrame(occ)
            df['symmetry'] = symm
            df['root'] = roots
            df['energy'] = energies
            df['rel_energy'] = df['energy'] - df['energy'].values.min()
            dfs.append(df)
        df = pd.concat(dfs, ignore_index=True)
        self.natural_occ = df.sort_values(by=['symmetry', 'root']).reset_index(drop=True)

    def parse_caspt2_energy(self):
        # parsing strings
        _resspt2 = " CASPT2 Root"
        _remspt2 = " MS-CASPT2 Root"
        _reref = "Reference energy:"
        _reweight = "Reference weight:"
        _reroot = "Compute H0 matrices for state"
        _respin = "Spin quantum number"
        found = self.find(_resspt2, _reref, _reweight, _reroot, _respin,
                          _remspt2)
        if not found[_resspt2]:
            return
        # TODO: this needs to be verified as it assumes that there is
        #       only one spin/multiplicity in the system
        #       do not know if this is always the case
        # get the spin quantum number of the system
        spin = float(found[_respin][0][1].split()[-1])
        # get the multiplicity
        mult = np.repeat(int(2*spin + 1), len(found[_remspt2]))
        sspt2 = []      # SSPT2 energies
        mspt2 = []      # MSPT2 energies
        reference = []  # reference energies from the wavefunction file
        weights = []    # reference weight (not sure what it actually is)
        roots = []      # root index
        arr = zip(found[_resspt2], found[_reref], found[_reweight],
                  found[_reroot], found[_remspt2])
        # grab all of the values we are interested in
        for (_, ss), (_, ref), (_, weight), (_, root), (_, ms) in arr:
            sspt2.append(float(ss.split()[-1]))
            reference.append(float(ref.split()[-1]))
            weights.append(float(weight.split()[-1]))
            roots.append(int(re.findall(r'\d+', root)[-1]))
            mspt2.append(float(ms.split()[-1]))
        df = pd.DataFrame.from_dict({'sspt2_au': sspt2, 'mspt2_au': mspt2,
                                     'ras_au': reference, 'weight': weights,
                                     'root': roots, 'mult': mult})
        self.caspt2_energy = df

    def __init__(self, *args, **kwargs):
        super(Output, self).__init__(*args, **kwargs)


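# Illustrative sketch (not part of the exatomic API): a typical workflow with
# the Output parser above. The paths and the helper name are hypothetical; the
# RasOrb file is assumed to belong to the same calculation as the output file
# so that add_orb can attach its MOMatrix/Orbital tables.
def _example_parse_output(out_path='mol.out', orb_path='mol.RasOrb'):
    """Parse an output file, pull a few tables, and attach orbital data."""
    out = Output(out_path)
    out.parse_atom()           # SEWARD geometry
    out.parse_sf_energy()      # spin-free energies, if present
    out.add_orb(orb_path)      # attach coefficients/occupations from the Orb file
    return out

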
class HDFMeta(TypedMeta):
    atom = Atom
    overlap = Overlap
    orbital = Orbital
    momatrix = MOMatrix
    basis_set = BasisSet
    basis_set_order = BasisSetOrder


class HDF(six.with_metaclass(HDFMeta, object)):
    _getter_prefix = 'parse'
    _to_universe = Editor._to_universe

    def to_universe(self):
        return self._to_universe()

    def parse_atom(self):
        ztag = 'CENTER_CHARGES'
        xtag = 'CENTER_COORDINATES'
        ltag = 'CENTER_LABELS'
        self.meta['symmetrized'] = False
        if 'DESYM_CENTER_CHARGES' in self._hdf:
            self.meta['symmetrized'] = True
            ztag = 'DESYM_' + ztag
            xtag = 'DESYM_' + xtag
            ltag = 'DESYM_' + ltag
        Z = pd.Series(self._hdf[ztag]).astype(np.int64)
        xyzs = np.array(self._hdf[xtag])
        labs = pd.Series(self._hdf[ltag]).apply(
            lambda s: s.decode('utf-8').strip())
        self.atom = pd.DataFrame.from_dict({'Z': Z,
                                            'x': xyzs[:, 0],
                                            'y': xyzs[:, 1],
                                            'z': xyzs[:, 2],
                                            'center': range(len(Z)),
                                            'symbol': Z.map(z2sym),
                                            'label': labs,
                                            'frame': 0})
        if self.meta['symmetrized']:
            symops = {'E':   np.array([ 1.,  1.,  1.]),
                      'x':   np.array([-1.,  0.,  0.]),
                      'y':   np.array([ 0., -1.,  0.]),
                      'z':   np.array([ 0.,  0., -1.]),
                      'xy':  np.array([-1., -1.,  0.]),
                      'xz':  np.array([-1.,  0., -1.]),
                      'yz':  np.array([ 0., -1., -1.]),
                      'xyz': np.array([-1., -1., -1.])}
            self.meta['symops'] = symops
            self.atom[['tag', 'symop']] = self.atom['label'].str.extract(
                '(.*):(.*)', expand=True)
            self.atom['tag'] = self.atom['tag'].str.strip()
            self.atom['symop'] = self.atom['symop'].str.strip()
            try:
                self.atom['utag'] = _add_unique_tags(self.atom)
            except ValueError:
                pass

    def parse_basis_set_order(self):
        bso = np.array(self._hdf['BASIS_FUNCTION_IDS'])
        df = {'center': bso[:, 0] - 1,
              'shell': bso[:, 1] - 1,
              'L': bso[:, 2],
              'frame': 0}
        if bso.shape[1] == 4:
            df['ml'] = bso[:, 3]
        else:
            df['l'] = bso[:, 3]
            df['m'] = bso[:, 4]
            df['n'] = bso[:, 5]
        self.basis_set_order = pd.DataFrame.from_dict(df)

    def parse_orbital(self):
        ens = np.array(self._hdf['MO_ENERGIES'])
        self.orbital = pd.DataFrame.from_dict({
            'energy': ens,
            'vector': range(len(ens)),
            'occupation': np.array(self._hdf['MO_OCCUPATIONS']),
            'label': pd.Series(self._hdf['MO_TYPEINDICES']).apply(
                lambda s: s.decode('utf-8')),
            'frame': 0, 'group': 0, 'spin': 0})

    def parse_overlap(self):
        if 'symmetrized' not in self.meta:
            self.parse_atom()
        key = 'AO_OVERLAP_MATRIX'
        if not self.meta['symmetrized']:
            self.overlap = Overlap.from_column(
                _flat_square_to_triangle(np.array(self._hdf[key])))
        else:
            print('Symmetrized overlap indices not set correctly.')
            self.overlap = Overlap.from_dict({
                'coef': np.array(self._hdf[key]),
                'chi0': 0, 'chi1': 0, 'frame': 0})

    def parse_basis_set(self):
        if 'symmetrized' not in self.meta:
            self.parse_atom()
        bset = np.array(self._hdf['PRIMITIVES'])
        idxs = np.array(self._hdf['PRIMITIVE_IDS'])
        bset = pd.DataFrame.from_dict({'alpha': bset[:, 0],
                                       'd': bset[:, 1],
                                       'center': idxs[:, 0] - 1,
                                       'L': idxs[:, 1],
                                       'shell': idxs[:, 2] - 1})
        self.no_dup = bset
        if self.meta['symmetrized']:
            self.basis_set, atommap = deduplicate_basis_sets(bset)
            self.atom['set'] = self.atom['center'].map(atommap)
        else:
            self.basis_set = bset.rename(columns={'center': 'set'})

    def parse_momatrix(self):
        if 'MO_VECTORS' not in self._hdf:
            return
        coefs = np.array(self._hdf['MO_VECTORS'])
        try:
            symm = np.array(self._hdf['DESYM_MATRIX'])
            print('Symmetry not supported on HDF yet.')
            return
        except KeyError:
            dim = np.int64(np.sqrt(coefs.shape[0]))
            self.momatrix = pd.DataFrame.from_dict({
                'orbital': np.repeat(range(dim), dim),
                'frame': 0,
                'chi': np.tile(range(dim), dim),
                'coef': coefs})

    def __getitem__(self, key):
        if isinstance(key, str):
            return getattr(self, key)
        raise KeyError()

    def __init__(self, *args, **kwargs):
        try:
            import h5py
        except ImportError:
            print("You must install h5py for access to HDF5 utilities.")
            return
        if not os.path.isfile(args[0]):
            print('Argument is likely an incorrect file path.')
        self._hdf = h5py.File(args[0], 'r')
        self.meta = {'gaussian': True, 'program': 'molcas'}


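# Illustrative sketch (not part of the exatomic API): reading a Molcas HDF5
# file (e.g. the *.scf.h5 or *.rasscf.h5 files written by recent OpenMolcas
# versions) with the HDF wrapper above. Requires h5py; the path and the helper
# name are hypothetical, and the file is assumed to contain the standard
# atom/orbital datasets used by the parse methods.
def _example_read_hdf(path='mol.scf.h5'):
    """Open a Molcas HDF5 file and build an exatomic universe from it."""
    hdf = HDF(path)
    hdf.parse_atom()
    hdf.parse_momatrix()
    return hdf.to_universe()

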
def _add_unique_tags(atom):
    """De-duplicates atom identifiers in symmetrized calcs."""
    utags = []
    for tag in atom['tag']:
        utag = tag
        while utag in utags:
            utag = ''.join(filter(str.isalpha, utag)) + \
                   str(int(''.join(filter(str.isdigit, utag))) + 1)
        utags.append(utag)
    return utags


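# Illustrative sketch (not part of the exatomic API): what _add_unique_tags
# does with duplicated tags. The toy DataFrame and helper name are
# hypothetical; the 'tag' column mimics a symmetrized atom table.
def _example_unique_tags():
    """Return de-duplicated tags for a toy atom table: ['H1', 'H2', 'O1']."""
    atom = pd.DataFrame({'tag': ['H1', 'H1', 'O1']})
    return _add_unique_tags(atom)

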
def parse_molcas(fp, momatrix=None, overlap=None, occvec=None, **kwargs):
    """
    Will parse a Molcas output file. Optionally it will attempt
    to parse additional information obtained from the same directory
    from specified Orb files or the AO overlap matrix and density matrix.
    If density keyword is specified, the momatrix keyword is ignored.

    Args:
        fp (str): Path to output file
        momatrix (str): file name of the C matrix of interest
        overlap (str): file name of the overlap matrix
        occvec (str): an occupation vector

    Returns:
        parsed (Editor): contains many attributes similar to the
            exatomic universe
    """
    uni = Output(fp, **kwargs)
    adir = os.sep.join(fp.split(os.sep)[:-1])
    if momatrix is not None:
        fp = os.sep.join([adir, momatrix])
        if os.path.isfile(fp):
            orb = Orb(fp)
            uni.momatrix = orb.momatrix
            uni.occupation_vector = orb.occupation_vector
            occvec = occvec if occvec is not None else orb.occupation_vector
            d = DensityMatrix.from_momatrix(orb.momatrix, occvec)
            uni.density = d
        else:
            print('Is {} in the same directory as {}?'.format(momatrix, fp))
    if overlap is not None:
        fp = os.sep.join([adir, overlap])
        if os.path.isfile(fp):
            uni.overlap = Overlap.from_file(fp)
        else:
            print('Is {} in the same directory as {}?'.format(overlap, fp))
    return uni


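# Illustrative sketch (not part of the exatomic API): the simplest call to
# parse_molcas. The path and helper name are hypothetical; the optional
# momatrix/overlap/occvec keywords point at companion files that sit in the
# same directory as the output file.
def _example_parse_molcas(path='calc/mol.out'):
    """Parse a Molcas output file into an Output editor."""
    return parse_molcas(path)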