pyiron · ligerzero-ai · Jul 7, 2024 · ahmedabdelkawy · Jul 12, 2024 · ahmedabdelkawy
diff --git a/pyiron_vasp/vasp/2024_07_07_BenchmarkVaspParser_Cleaned.ipynb b/pyiron_vasp/vasp/2024_07_07_BenchmarkVaspParser_Cleaned.ipynb
@@ -0,0 +1,280 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from pymatgen.io.vasp.outputs import Vasprun, Outcar\n",
+    "from pymatgen.io.ase import AseAtomsAdaptor\n",
+    "from ase.io import read\n",
+    "from pymatgen.io.vasp import Chgcar, Potcar\n",
+    "\n",
+    "path = \"/root/github_dev/github_pyiron/pyiron_atomistics/tests/static/vasp_test_files/full_job_sample\"\n",
+    "\n",
+    "def parse_vasp_output_pymatgen(path):\n",
+    "    # Parse the vasprun.xml file\n",
+    "    vr = Vasprun(filename=f\"{path}/vasprun.xml\",\n",
+    "                 parse_projected_eigen=True,\n",
+    "                 separate_spins=False)\n",
+    "\n",
+    "    # Initialize the output dictionary\n",
+    "    output = {}\n",
+    "    output[\"generic\"] = {}\n",
+    "    output[\"description\"] = 'This contains all the output static from this particular VASP run'\n",
+    "\n",
+    "    # 1. Get ASE atoms object via pymatgen\n",
+    "    output[\"structure\"] = AseAtomsAdaptor.get_atoms(vr.final_structure)\n",
+    "\n",
+    "    # # 2. Get ASE atoms object from ASE\n",
+    "    # output[\"structure\"] = read(f\"{path}/CONTCAR\")\n",
+    "\n",
+    "    # 3. Get charge density from CHGCAR\n",
+    "    # For charge density, I'm not too clear on the details of what exactly is happening,\n",
+    "    # but it appears it is spawned from pymatgen's parser originally anyway (per their comments in vasp.volumetric_data)\n",
+    "    output[\"charge_density\"] = Chgcar.from_file(f\"{path}/CHGCAR\")\n",
+    "\n",
+    "    # 4. Extract generic information\n",
+    "    output[\"generic\"][\"temperature\"] = vr.parameters['TEBEG'] # this is just wrong in pyiron's current parsing. defaults to 0, is not 0.\n",
+    "    # this is NOT actually 0, vasp does calculations at finite temperatures, even in DFT. This is for faster convergence - 0.0001 K is the default\n",
+    "\n",
+    "    # Here we use the convention that positive stress is compressive, as is done in ASE\n",
+    "    # Units here are eV/A^3\n",
+    "    output[\"generic\"][\"stresses\"] = [-np.array(step[\"stress\"])/1600.21766208 for step in vr.ionic_steps]\n",
+    "    # Don't use pressures entry anymore, I have no idea what it even is - just for the love of god, just let it die!!!\n",
+    "    output[\"generic\"][\"pressures\"] = output[\"generic\"][\"stresses\"]\n",
+    "    output[\"generic\"][\"forces\"] = [step['forces'] for step in vr.ionic_steps]\n",
+    "    # Similarly to pressures, the cell object in generic interface just needs to die - all useful information is in the structure\n",
+    "    # The edge case is where structures are extremely large and might not fit into memory, but cells do,\n",
+    "    # but it's so niche and pollutes the interface for no good reason. \n",
+    "    # I feel like most users aren't doing calculations with structures that don't fit into memory\n",
+    "    output[\"generic\"][\"cells\"] = [AseAtomsAdaptor.get_atoms(step[\"structure\"]).cell for step in vr.ionic_steps]\n",
+    "    # See above - is this necessary?\n",
+    "    output[\"generic\"][\"volume\"] = [step[\"structure\"].lattice.volume for step in vr.ionic_steps]\n",
+    "    # output_pyiron[\"generic\"][\"energy_pot\"] This is e_fr_energy in pymatgen\n",
+    "    # in pymatgen\n",
+    "    # 'e_fr_energy': -17.73798679,\n",
+    "    #  'e_wo_entrp': -17.72353582,\n",
+    "    #  'e_0_energy': -17.7331698}\n",
+    "    # pyiron: \n",
+    "    # output_pyiron[\"generic\"][\"energy_pot\"] -> array([-17.73798679])\n",
+    "    # output_pyiron[\"generic\"][\"energy_tot\"] -> array([-17.73798679])\n",
+    "    output[\"generic\"][\"energy_pot\"] = [step['electronic_steps'][-1]['e_wo_entrp'] for step in vr.ionic_steps]\n",
+    "    # output_pyiron[\"generic\"][\"energy_tot\"] This is also equivalent to e_fr_energy in pymatgen (e_tot = e_pot (!?))\n",
+    "    output[\"generic\"][\"energy_tot\"] = [step['electronic_steps'][-1]['e_fr_energy'] for step in vr.ionic_steps]\n",
+    "    # output[\"generic\"][\"energy_tot\"] = [step['electronic_steps'][-1]['e_0_energy'] for step in vasprun.ionic_steps] # energy sigma-> 0 (\"real\" free energy at 0K)\n",
+    "\n",
+    "    # For some reason steps is acquired by calling np.arange(len(steps)) in current pyiron parser. Surely returning a list when \n",
+    "    # the expected value is an integer is nonsensical? arange also generates 0 in the event of len=1...\n",
+    "    output[\"generic\"][\"steps\"] = len(vr.ionic_steps)\n",
+    "    # Another grievous offender that pollutes the interface for no good reason...\n",
+    "    # introducing \"positions\", when \"structures\" is present in output is clearly enough. Doubling storage in hdf for no new information is just nasty work\n",
+    "    output[\"generic\"][\"positions\"] = [AseAtomsAdaptor.get_atoms(step[\"structure\"]).positions for step in vr.ionic_steps]\n",
+    "\n",
+    "    # 5. Read elastic constants from OUTCAR\n",
+    "    try:\n",
+    "        elastic_constants = Outcar(filename=f\"{path}/OUTCAR\").read_elastic_tensor()\n",
+    "    except Exception as e:\n",
+    "        elastic_constants = None\n",
+    "    output[\"generic\"][\"elastic_constants\"] = elastic_constants\n",
+    "\n",
+    "    # 6. Extract DFT information\n",
+    "    # Notes: here is the limitations of pymatgen parsing - they only parse the final step.\n",
+    "    outcar = Outcar(filename=f\"{path}/OUTCAR\")\n",
+    "    output[\"generic\"][\"dft\"] = {}\n",
+    "    output[\"generic\"][\"dft\"][\"n_elect\"] = vr.parameters[\"NELECT\"]\n",
+    "\n",
+    "    # NOTE: Ahmed look here (?) probably needs to just use our own parser\n",
+    "    output[\"generic\"][\"dft\"][\"potentiostat_output\"] = [] \n",
+    "\n",
+    "    output[\"generic\"][\"dft\"][\"magnetization\"] = [outcar.magnetization]\n",
+    "    # pymatgen Outcar parser alternative only allows us to parse last step, but does not affect final_magmoms here\n",
+    "    output[\"generic\"][\"dft\"][\"final_magmoms\"] = [ionic_entry[\"tot\"] for ionic_entry in outcar.magnetization]\n",
+    "    # pymatgen only allows the calculation of the fermi energy on the last step\n",
+    "    output[\"generic\"][\"dft\"][\"e_fermi_list\"] = np.array(vr.calculate_efermi())\n",
+    "    # pymatgen only allows the calculation of the valence band maximum on the last step\n",
+    "    # NOTE: SERIOUS PROBLEMS - VBM CALCULATION IS DIFFERENT IN PYIRON VS PYMATGEN - (I (Han) have no idea :))\n",
+    "    output[\"generic\"][\"dft\"][\"vbm_list\"] = vr.eigenvalue_band_properties[1]\n",
+    "    # pymatgen only allows the calculation of the conduction band minimum on the last step\n",
+    "    # NOTE: SERIOUS PROBLEMS - CBM CALCULATION IS DIFFERENT IN PYIRON VS PYMATGEN - (I (Han) have no idea :))\n",
+    "    output[\"generic\"][\"dft\"][\"cbm_list\"] = vr.eigenvalue_band_properties[2]\n",
+    "    # pymatgen only allows parsing of the final total energy contribution...\n",
+    "    output[\"generic\"][\"dft\"][\"ediel_sol\"] = outcar.final_energy_contribs[\"Ediel_sol\"]\n",
+    " \n",
+    "\n",
+    "    output[\"generic\"][\"dft\"][\"ediel_sol\"] = outcar.final_energy_contribs[\"Ediel_sol\"]\n",
+    "    output[\"generic\"][\"dft\"][\"potentiostat_output\"] = []\n",
+    "\n",
+    "    # Read POTCAR file for valence charges - no support in pymatgen for extracting valence from vasprun (although possible)\n",
+    "    # NOTE: Commented out because we don't ahve valid POTCAR for testing here\n",
+    "    # potcar = Potcar.from_file(f\"{path}/POTCAR\")\n",
+    "    #output[\"generic\"][\"dft\"][\"valence_charges\"] = [ps.nelectrons for ps in potcar]\n",
+    "\n",
+    "    # SCF energies\n",
+    "    output[\"generic\"][\"dft\"][\"scf_dipole_mom\"] = [] # TODO: Likely needs to default to OUTCAR parsing via pyiron OUTCAR\n",
+    "    output[\"generic\"][\"dft\"][\"scf_energy_int\"] = [step['electronic_steps'][-1]['e_fr_energy'] for step in vr.ionic_steps]\n",
+    "    output[\"generic\"][\"dft\"][\"scf_energy_free\"] = [step['electronic_steps'][-1]['e_fr_energy'] for step in vr.ionic_steps]\n",
+    "    output[\"generic\"][\"dft\"][\"scf_energy_zero\"] = [step['electronic_steps'][-1]['e_0_energy'] for step in vr.ionic_steps]\n",
+    "    output[\"generic\"][\"dft\"][\"energy_int\"] = [step['electronic_steps'][-1]['e_fr_energy'] for step in vr.ionic_steps]\n",
+    "    output[\"generic\"][\"dft\"][\"energy_free\"] = [step['electronic_steps'][-1]['e_fr_energy'] for step in vr.ionic_steps]\n",
+    "    output[\"generic\"][\"dft\"][\"energy_zero\"] = [step['electronic_steps'][-1]['e_0_energy'] for step in vr.ionic_steps]\n",
+    "\n",
+    "    # 7. Extract band structure and DOS\n",
+    "    output[\"generic\"][\"dft\"][\"bands\"] = {}\n",
+    "    # NOTE: I'm in favour of just doing:\n",
+    "    # output[\"generic\"][\"dft\"][\"bands\"] = vr.get_band_structure().as_dict()\n",
+    "    # BUT if you want to keep the prior structured output\n",
+    "    vr_bs_dict = vr.get_band_structure().as_dict()\n",
+    "    output[\"generic\"][\"dft\"][\"bands\"][\"k_points\"] = vr.actual_kpoints\n",
+    "    return output\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path = \"/root/github_dev/github_pyiron/pyiron_atomistics/tests/static/vasp_test_files/full_job_sample\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/root/miniconda3/envs/pyiron/lib/python3.12/site-packages/pymatgen/io/vasp/outputs.py:1185: UserWarning: No POTCAR file with matching TITEL fields was found in\n",
+      "/root/github_dev/github_pyiron/pyiron_atomistics/tests/static/vasp_test_files/full_job_sample/POTCAR\n",
+      "  warnings.warn(\"No POTCAR file with matching TITEL fields was found in\\n\" + \"\\n  \".join(potcar_paths))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "33.4 ms ± 665 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit -n100\n",
+    "output = parse_vasp_output_pymatgen(path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/root/miniconda3/envs/pyiron/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pyiron_atomistics.vasp.output import parse_vasp_output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "21.3 ms ± 1.04 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit -n100\n",
+    "output_pyiron = parse_vasp_output(path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path_real = \"/root/github_dev/dev_vasp_scraper/As-30-d-2.5\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3.34 s ± 104 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit -n1\n",
+    "output = parse_vasp_output_pymatgen(path_real)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2.92 s ± 61.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit -n1\n",
+    "output_pyiron = parse_vasp_output(path_real)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pyiron",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}