diff --git a/Makefile b/Makefile index ff5b1e7091..4fd3dfbc35 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,7 @@ help: @echo " install-kinbot Install KinBot" @echo " install-sella Install Sella" @echo " install-xtb Install xTB" + @echo " install-crest Install CREST" @echo " install-torchani Install TorchANI" @echo " install-ob Install OpenBabel" @echo "" @@ -100,6 +101,9 @@ install-sella: install-xtb: bash $(DEVTOOLS_DIR)/install_xtb.sh +install-crest: + bash $(DEVTOOLS_DIR)/install_crest.sh + install-torchani: bash $(DEVTOOLS_DIR)/install_torchani.sh diff --git a/arc/constants.pxd b/arc/constants.pxd index 67b3f412bb..9fc3b9127d 100644 --- a/arc/constants.pxd +++ b/arc/constants.pxd @@ -1 +1 @@ -cdef double pi, Na, kB, R, h, hbar, c, e, m_e, m_p, m_n, amu, a0, bohr_to_angstrom, E_h, F, E_h_kJmol +cdef double pi, Na, kB, R, h, hbar, c, e, m_e, m_p, m_n, amu, a0, E_h, F, E_h_kJmol, bohr_to_angstrom, angstrom_to_bohr diff --git a/arc/constants.py b/arc/constants.py index 83f99051d1..6923a0f16b 100644 --- a/arc/constants.py +++ b/arc/constants.py @@ -79,6 +79,9 @@ #: Vacuum permittivity epsilon_0 = 8.8541878128 +bohr_to_angstrom = 0.529177 +angstrom_to_bohr = 1 / bohr_to_angstrom + # Cython does not automatically place module-level variables into the module # symbol table when in compiled mode, so we must do this manually so that we # can use the constants from both Cython and regular Python code @@ -101,4 +104,5 @@ 'F': F, 'epsilon_0': epsilon_0, 'bohr_to_angstrom': bohr_to_angstrom, + 'angstrom_to_bohr': angstrom_to_bohr, }) diff --git a/arc/job/adapter.py b/arc/job/adapter.py index de8c747718..962da3c9b3 100644 --- a/arc/job/adapter.py +++ b/arc/job/adapter.py @@ -96,6 +96,7 @@ class JobEnum(str, Enum): # TS search methods autotst = 'autotst' # AutoTST, 10.1021/acs.jpca.7b07361, 10.26434/chemrxiv.13277870.v2 heuristics = 'heuristics' # ARC's heuristics + crest = 'crest' # CREST conformer/TS search kinbot = 'kinbot' # KinBot, 10.1016/j.cpc.2019.106947 gcn = 
'gcn' # Graph neural network for isomerization, https://doi.org/10.1021/acs.jpclett.0c00500 user = 'user' # user guesses @@ -780,8 +781,7 @@ def _get_additional_job_info(self): content += '\n' else: raise ValueError(f'Unrecognized cluster software: {cluster_soft}') - if content: - self.additional_job_info = content.lower() + self.additional_job_info = content.lower() if content else None def _check_job_server_status(self) -> str: """ @@ -801,6 +801,10 @@ def _check_job_ess_status(self): Raises: IOError: If the output file and any additional server information cannot be found. """ + existing_keywords = list(self.job_status[1].get('keywords', list())) + # Refresh scheduler-side logs before ESS parsing so server-reported OOMs + # can be detected even when the output file is absent or incomplete. + self._get_additional_job_info() if self.server != 'local' and self.execution_type != 'incore': if os.path.exists(self.local_path_to_output_file): os.remove(self.local_path_to_output_file) @@ -839,7 +843,22 @@ def _check_job_ess_status(self): ) else: status, keywords, error, line = '', '', '', '' + if self.additional_job_info: + try: + status, keywords, error, line = determine_ess_status( + output_path=self.local_path_to_output_file, + species_label=self.species_label, + job_type=self.job_type, + job_log=self.additional_job_info, + software=self.job_adapter, + ) + except FileNotFoundError: + status, keywords, error, line = '', '', '', '' self.job_status[1]['status'] = status + if 'max_total_job_memory' in existing_keywords and status == 'errored' \ + and isinstance(keywords, list) and 'Memory' in keywords \ + and 'max_total_job_memory' not in keywords: + keywords.append('max_total_job_memory') self.job_status[1]['keywords'] = keywords self.job_status[1]['error'] = error self.job_status[1]['line'] = line.rstrip() diff --git a/arc/job/adapter_test.py b/arc/job/adapter_test.py index 9657f9a62a..709dd67fe1 100644 --- a/arc/job/adapter_test.py +++ b/arc/job/adapter_test.py @@ 
-180,6 +180,12 @@ def setUpClass(cls): server='server3', testing=True, ) + os.makedirs(cls.job_5.local_path, exist_ok=True) + fixture_path = os.path.join(ARC_TESTING_PATH, 'trsh', 'wall_exceeded.txt') + with open(fixture_path, 'r') as f: + log_content = f.read() + with open(os.path.join(cls.job_5.local_path, 'out.txt'), 'w') as f: + f.write(log_content) cls.job_6 = GaussianAdapter(execution_type='queue', job_name='opt_101', job_type='opt', @@ -248,6 +254,24 @@ def test_set_cpu_and_mem(self): self.assertEqual(self.job_4.submit_script_memory, expected_memory) self.job_4.server = 'local' + def test_set_cpu_and_mem_marks_max_total_job_memory(self): + """Test tagging jobs whose requested memory is clipped to the node cap.""" + job = GaussianAdapter(execution_type='queue', + job_type='opt', + level=Level(method='cbs-qb3'), + project='test', + project_directory=os.path.join(ARC_TESTING_PATH, 'test_JobAdapter'), + species=[ARCSpecies(label='spc1', xyz=['O 0 0 1'])], + server='server2', + job_memory_gb=300, + testing=True, + ) + + job.set_cpu_and_mem() + + self.assertAlmostEqual(job.job_memory_gb, 256 * 0.95) + self.assertIn('max_total_job_memory', job.job_status[1]['keywords']) + def test_set_file_paths(self): """Test setting up the job's paths""" self.assertEqual(self.job_1.local_path, os.path.join(self.job_1.project_directory, 'calcs', 'Species', @@ -321,6 +345,42 @@ def test_determine_job_status(self): self.assertEqual(self.job_5.job_status[1]['status'], 'errored') self.assertEqual(self.job_5.job_status[1]['keywords'], ['ServerTimeLimit']) + @patch('arc.job.adapter.determine_ess_status') + def test_preserve_max_total_job_memory_keyword(self, mock_determine_ess_status): + """Test preserving the max_total_job_memory marker across ESS status parsing.""" + self.job_4.job_status[1]['keywords'] = ['max_total_job_memory'] + self.job_4.initial_time = datetime.datetime.now() - datetime.timedelta(minutes=2) + self.job_4.final_time = datetime.datetime.now() - 
datetime.timedelta(minutes=1) + os.makedirs(self.job_4.local_path, exist_ok=True) + with open(self.job_4.local_path_to_output_file, 'w') as f: + f.write('dummy output') + mock_determine_ess_status.return_value = ( + 'errored', + ['MDCI', 'Memory'], + 'Insufficient job memory.', + 'Please increase MaxCore', + ) + + self.job_4._check_job_ess_status() + + self.assertEqual(self.job_4.job_status[1]['status'], 'errored') + self.assertEqual(self.job_4.job_status[1]['keywords'], ['MDCI', 'Memory', 'max_total_job_memory']) + + def test_check_job_ess_status_without_output_uses_job_log_memory_error(self): + """Test detecting server-reported memory errors even when the output file is absent.""" + if os.path.isfile(self.job_4.local_path_to_output_file): + os.remove(self.job_4.local_path_to_output_file) + self.job_4.initial_time = datetime.datetime.now() - datetime.timedelta(minutes=2) + self.job_4.final_time = datetime.datetime.now() - datetime.timedelta(minutes=1) + self.job_4.additional_job_info = '\tMEMORY EXCEEDED\n' + + with patch.object(self.job_4, '_get_additional_job_info'): + self.job_4._check_job_ess_status() + + self.assertEqual(self.job_4.job_status[1]['status'], 'errored') + self.assertEqual(self.job_4.job_status[1]['keywords'], ['Memory']) + self.assertEqual(self.job_4.job_status[1]['error'], 'Insufficient job memory.') + @patch( "arc.job.trsh.servers", { diff --git a/arc/job/adapters/orca.py b/arc/job/adapters/orca.py index 71e6334f2c..01d0ceeef5 100644 --- a/arc/job/adapters/orca.py +++ b/arc/job/adapters/orca.py @@ -84,13 +84,14 @@ def _format_orca_basis(basis: str) -> str: # job_type_2: reserved for Opt + Freq. 
# restricted: 'R' = closed-shell SCF, 'U' = spin unrestricted SCF, 'RO' = open-shell spin restricted SCF # auxiliary_basis: required for DLPNO calculations (speed up calculation) +# cabs: Complementary Auxiliary Basis Set for F12 calculations (e.g., cc-pVTZ-F12-CABS) # memory: MB per core (must increase as system gets larger) # cpus: must be less than number of electron pairs, defaults to min(heavy atoms, cpus limit) # job_options_blocks: input blocks that enable detailed control over program # job_options_keywords: input keywords that control the job # method_class: 'HF' for wavefunction methods (hf, mp, cc, dlpno ...). 'KS' for DFT methods. # options: additional keywords to control job (e.g., TightSCF, NormalPNO ...) -input_template = """!${restricted}${method_class} ${method} ${basis} ${auxiliary_basis} ${keywords} +input_template = """!${restricted}${method_class} ${method} ${basis} ${auxiliary_basis}${cabs} ${keywords} !${job_type_1} ${job_type_2} %%maxcore ${memory} @@ -254,6 +255,12 @@ def write_input_file(self) -> None: """ Write the input file to execute the job on the server. """ + if 'f12' in self.level.method and not self.level.cabs: + raise ValueError( + f"Level '{self.level}' uses an F12 method without a CABS basis. " + f"Set `cabs:` in the level spec (e.g. cc-pVTZ-F12-CABS). " + f"Without it ORCA runs with DimCABS = 0 and returns non-F12 energies." 
+ ) input_dict = dict() for key in ['block', 'scan', @@ -264,6 +271,7 @@ def write_input_file(self) -> None: input_dict[key] = '' input_dict['auxiliary_basis'] = _format_orca_basis(self.level.auxiliary_basis or '') input_dict['basis'] = _format_orca_basis(self.level.basis or '') + input_dict['cabs'] = f' {_format_orca_basis(self.level.cabs)}' if self.level.cabs else '' input_dict['charge'] = self.charge input_dict['cpus'] = self.cpu_cores input_dict['label'] = self.species_label @@ -272,30 +280,28 @@ def write_input_file(self) -> None: input_dict['multiplicity'] = self.multiplicity input_dict['xyz'] = xyz_to_str(self.xyz) - scf_convergence = self.args['keyword'].get('scf_convergence', '').lower() or \ - orca_default_options_dict['global']['keyword'].get('scf_convergence', '').lower() - if not scf_convergence: + self.args['keyword'].setdefault( + 'scf_convergence', + orca_default_options_dict['global']['keyword'].get('scf_convergence', '').lower()) + if not self.args['keyword']['scf_convergence']: raise ValueError('Orca SCF convergence is not specified. Please specify this variable either in ' 'settings.py as default or in the input file as additional options.') - self.add_to_args(val=scf_convergence, key1='keyword') # Orca requires different blocks for wavefunction methods and DFT methods if self.level.method_type == 'dft': input_dict['method_class'] = 'KS' - # DFT grid must be the same for both opt and freq - if self.fine: - self.add_to_args(val='defgrid3', key1='keyword') - else: - self.add_to_args(val='defgrid2', key1='keyword') + # DFT grid must be the same for both opt and freq. + # Users can override by setting `dft_grid` in args.keyword (e.g. dft_grid: DEFGRID1). 
+ self.args['keyword'].setdefault('dft_grid', 'defgrid3' if self.fine else 'defgrid2') elif self.level.method_type == 'wavefunction': input_dict['method_class'] = 'HF' if 'dlpno' in self.level.method: - dlpno_threshold = self.args['keyword'].get('dlpno_threshold', '').lower() or \ - orca_default_options_dict['global']['keyword'].get('dlpno_threshold', '').lower() - if not dlpno_threshold: + self.args['keyword'].setdefault( + 'dlpno_threshold', + orca_default_options_dict['global']['keyword'].get('dlpno_threshold', '').lower()) + if not self.args['keyword']['dlpno_threshold']: raise ValueError('Orca DLPNO threshold is not specified. Please specify this variable either in ' 'settings.py as default or in the input file as additional options.') - self.add_to_args(val=dlpno_threshold, key1='keyword') else: logger.debug(f'Running {self.level.method_type} {self.level.method} method in Orca.') diff --git a/arc/job/adapters/orca_test.py b/arc/job/adapters/orca_test.py index 4b7725da51..df63d135f9 100644 --- a/arc/job/adapters/orca_test.py +++ b/arc/job/adapters/orca_test.py @@ -99,6 +99,34 @@ def test_set_input_file_memory(self): expected_memory = math.ceil(14 * 1024 / 8) self.assertEqual(self.job_1.input_file_memory, expected_memory) + def test_set_input_file_memory_with_configured_core_count(self): + """Test ORCA %%maxcore calculation for a configured total memory and cpu count.""" + original_memory = self.job_1.job_memory_gb + original_cpu_cores = self.job_1.cpu_cores + self.job_1.job_memory_gb = 250 + self.job_1.cpu_cores = 22 + self.job_1.set_input_file_memory() + self.assertEqual(self.job_1.input_file_memory, math.ceil(250 * 1024 / 22)) + self.job_1.job_memory_gb = original_memory + self.job_1.cpu_cores = original_cpu_cores + self.job_1.set_input_file_memory() + + def test_write_input_file_with_configured_core_count(self): + """Test rendering ORCA input for a configured total memory and cpu count.""" + original_memory = self.job_1.job_memory_gb + original_cpu_cores = 
self.job_1.cpu_cores + self.job_1.job_memory_gb = 250 + self.job_1.cpu_cores = 22 + self.job_1.set_input_file_memory() + self.job_1.write_input_file() + with open(os.path.join(self.job_1.local_path, input_filenames[self.job_1.job_adapter]), 'r') as f: + content = f.read() + self.assertIn('%maxcore 11637', content) + self.assertIn('%pal nprocs 22 end', content) + self.job_1.job_memory_gb = original_memory + self.job_1.cpu_cores = original_cpu_cores + self.job_1.set_input_file_memory() + def test_write_input_file(self): """Test writing Orca input files""" self.job_1.write_input_file() diff --git a/arc/job/adapters/ts/__init__.py b/arc/job/adapters/ts/__init__.py index 5d571e8e80..4525adeed7 100644 --- a/arc/job/adapters/ts/__init__.py +++ b/arc/job/adapters/ts/__init__.py @@ -1,6 +1,8 @@ import arc.job.adapters.ts.autotst_ts +import arc.job.adapters.ts.crest import arc.job.adapters.ts.gcn_ts import arc.job.adapters.ts.heuristics import arc.job.adapters.ts.kinbot_ts +import arc.job.adapters.ts.seed_hub import arc.job.adapters.ts.xtb_gsm import arc.job.adapters.ts.orca_neb diff --git a/arc/job/adapters/ts/crest.py b/arc/job/adapters/ts/crest.py new file mode 100644 index 0000000000..6396a968da --- /dev/null +++ b/arc/job/adapters/ts/crest.py @@ -0,0 +1,520 @@ +""" +Utilities for running CREST within ARC. + +Separated from heuristics so CREST can be conditionally imported and reused. 
+""" + +import datetime +import os +import time +from typing import TYPE_CHECKING, List, Optional, Union + +from arc.common import almost_equal_coords, get_logger +from arc.imports import settings, submit_scripts +from arc.job.adapter import JobAdapter +from arc.job.adapters.common import _initialize_adapter, ts_adapters_by_rmg_family +from arc.job.adapters.ts.heuristics import DIHEDRAL_INCREMENT +from arc.job.adapters.ts.seed_hub import get_ts_seeds, get_wrapper_constraints +from arc.job.factory import register_job_adapter +from arc.job.local import check_job_status, submit_job +from arc.plotter import save_geo +from arc.species.converter import reorder_xyz_string, str_to_xyz, xyz_to_str +from arc.species.species import ARCSpecies, TSGuess + +if TYPE_CHECKING: + from arc.level import Level + from arc.reaction import ARCReaction + +logger = get_logger() + +MAX_CHECK_INTERVAL_SECONDS = 100 + +CREST_PATH = settings.get("CREST_PATH", None) +CREST_ENV_PATH = settings.get("CREST_ENV_PATH", None) +SERVERS = settings.get("servers", {}) + + +def crest_available() -> bool: + """ + Return whether CREST is configured for use. + """ + return bool(SERVERS.get("local")) and bool(CREST_PATH or CREST_ENV_PATH) + + +class CrestAdapter(JobAdapter): + """ + A class for executing CREST TS conformer searches based on heuristics-generated guesses. 
+ """ + + def __init__(self, + project: str, + project_directory: str, + job_type: Union[List[str], str], + args: Optional[dict] = None, + bath_gas: Optional[str] = None, + checkfile: Optional[str] = None, + conformer: Optional[int] = None, + constraints: Optional[List] = None, + cpu_cores: Optional[str] = None, + dihedral_increment: Optional[float] = None, + dihedrals: Optional[List[float]] = None, + directed_scan_type: Optional[str] = None, + ess_settings: Optional[dict] = None, + ess_trsh_methods: Optional[List[str]] = None, + execution_type: Optional[str] = None, + fine: bool = False, + initial_time: Optional[Union['datetime.datetime', str]] = None, + irc_direction: Optional[str] = None, + job_id: Optional[int] = None, + job_memory_gb: float = 14.0, + job_name: Optional[str] = None, + job_num: Optional[int] = None, + job_server_name: Optional[str] = None, + job_status: Optional[List[Union[dict, str]]] = None, + level: Optional['Level'] = None, + max_job_time: Optional[float] = None, + run_multi_species: bool = False, + reactions: Optional[List['ARCReaction']] = None, + rotor_index: Optional[int] = None, + server: Optional[str] = None, + server_nodes: Optional[list] = None, + queue: Optional[str] = None, + attempted_queues: Optional[List[str]] = None, + species: Optional[List[ARCSpecies]] = None, + testing: bool = False, + times_rerun: int = 0, + torsions: Optional[List[List[int]]] = None, + tsg: Optional[int] = None, + xyz: Optional[dict] = None, + ): + + self.incore_capacity = 50 + self.job_adapter = 'crest' + self.command = None + self.execution_type = execution_type or 'incore' + + if reactions is None: + raise ValueError('Cannot execute TS CREST without ARCReaction object(s).') + + dihedral_increment = dihedral_increment or DIHEDRAL_INCREMENT + + _initialize_adapter(obj=self, + is_ts=True, + project=project, + project_directory=project_directory, + job_type=job_type, + args=args, + bath_gas=bath_gas, + checkfile=checkfile, + conformer=conformer, + 
constraints=constraints, + cpu_cores=cpu_cores, + dihedral_increment=dihedral_increment, + dihedrals=dihedrals, + directed_scan_type=directed_scan_type, + ess_settings=ess_settings, + ess_trsh_methods=ess_trsh_methods, + fine=fine, + initial_time=initial_time, + irc_direction=irc_direction, + job_id=job_id, + job_memory_gb=job_memory_gb, + job_name=job_name, + job_num=job_num, + job_server_name=job_server_name, + job_status=job_status, + level=level, + max_job_time=max_job_time, + run_multi_species=run_multi_species, + reactions=reactions, + rotor_index=rotor_index, + server=server, + server_nodes=server_nodes, + queue=queue, + attempted_queues=attempted_queues, + species=species, + testing=testing, + times_rerun=times_rerun, + torsions=torsions, + tsg=tsg, + xyz=xyz, + ) + + def write_input_file(self) -> None: + pass + + def set_files(self) -> None: + pass + + def set_additional_file_paths(self) -> None: + pass + + def set_input_file_memory(self) -> None: + pass + + def execute_incore(self): + self._log_job_execution() + self.initial_time = self.initial_time if self.initial_time else datetime.datetime.now() + + supported_families = [key for key, val in ts_adapters_by_rmg_family.items() if 'crest' in val] + + self.reactions = [self.reactions] if not isinstance(self.reactions, list) else self.reactions + for rxn in self.reactions: + if rxn.family not in supported_families: + logger.warning(f'The CREST TS search adapter does not support the {rxn.family} reaction family.') + continue + if any(spc.get_xyz() is None for spc in rxn.r_species + rxn.p_species): + logger.warning(f'The CREST TS search adapter cannot process a reaction if 3D coordinates of ' + f'some/all of its reactants/products are missing.\nNot processing {rxn}.') + continue + if not crest_available(): + logger.warning('CREST is not available. 
Skipping CREST TS search.') + break + + if rxn.ts_species is None: + rxn.ts_species = ARCSpecies(label='TS', + is_ts=True, + charge=rxn.charge, + multiplicity=rxn.multiplicity, + ) + + tsg = TSGuess(method='CREST') + tsg.tic() + + crest_job_dirs = [] + xyz_guesses = get_ts_seeds( + reaction=rxn, + base_adapter='heuristics', + dihedral_increment=self.dihedral_increment, + ) + if not xyz_guesses: + logger.warning(f'CREST TS search failed to generate any seed guesses for {rxn.label}.') + tsg.tok() + continue + + for iteration, xyz_entry in enumerate(xyz_guesses): + xyz_guess = xyz_entry.get("xyz") + family = xyz_entry.get("family", rxn.family) + if xyz_guess is None: + continue + + crest_constraint_atoms = get_wrapper_constraints( + wrapper='crest', + reaction=rxn, + seed=xyz_entry, + ) + if not crest_constraint_atoms: + logger.warning( + f"Could not determine CREST constraint atoms for {rxn.label} crest seed {iteration} " + f"(family: {family}). Skipping this CREST seed." + ) + continue + + crest_job_dir = crest_ts_conformer_search( + xyz_guess, + crest_constraint_atoms["A"], + crest_constraint_atoms["H"], + crest_constraint_atoms["B"], + path=self.local_path, + xyz_crest_int=iteration, + ) + crest_job_dirs.append(crest_job_dir) + + if not crest_job_dirs: + logger.warning(f'CREST TS search failed to prepare any jobs for {rxn.label}.') + tsg.tok() + continue + + crest_jobs = submit_crest_jobs(crest_job_dirs) + monitor_crest_jobs(crest_jobs) + xyz_guesses_crest = process_completed_jobs(crest_jobs) + tsg.tok() + + for method_index, xyz in enumerate(xyz_guesses_crest): + if xyz is None: + continue + unique = True + for other_tsg in rxn.ts_species.ts_guesses: + if almost_equal_coords(xyz, other_tsg.initial_xyz): + if hasattr(other_tsg, "method_sources"): + other_tsg.method_sources = other_tsg._normalize_method_sources( + (other_tsg.method_sources or []) + ["crest"] + ) + unique = False + break + if unique: + ts_guess = TSGuess(method='CREST', + 
index=len(rxn.ts_species.ts_guesses), + method_index=method_index, + t0=tsg.t0, + execution_time=tsg.execution_time, + success=True, + family=rxn.family, + xyz=xyz, + ) + rxn.ts_species.ts_guesses.append(ts_guess) + save_geo(xyz=xyz, + path=self.local_path, + filename=f'CREST_{method_index}', + format_='xyz', + comment=f'CREST {method_index}, family: {rxn.family}', + ) + + if len(self.reactions) < 5: + successes = [tsg for tsg in rxn.ts_species.ts_guesses if tsg.success and 'crest' in tsg.method.lower()] + if successes: + logger.info(f'CREST successfully found {len(successes)} TS guesses for {rxn.label}.') + else: + logger.info(f'CREST did not find any successful TS guesses for {rxn.label}.') + + self.final_time = datetime.datetime.now() + + def execute_queue(self): + self.execute_incore() + + +def crest_ts_conformer_search( + xyz_guess: dict, + a_atom: int, + h_atom: int, + b_atom: int, + path: str = "", + xyz_crest_int: int = 0, +) -> str: + """ + Prepare a CREST TS conformer search job: + - Write coords.ref and constraints.inp + - Write a PBS/HTCondor submit script using submit_scripts["local"]["crest"] + - Return the CREST job directory path + """ + path = os.path.join(path, f"crest_{xyz_crest_int}") + os.makedirs(path, exist_ok=True) + + # --- coords.ref --- + symbols = xyz_guess["symbols"] + converted_coords = reorder_xyz_string( + xyz_str=xyz_to_str(xyz_guess), + reverse_atoms=True, + convert_to="bohr", + ) + coords_ref_content = f"$coord\n{converted_coords}\n$end\n" + coords_ref_path = os.path.join(path, "coords.ref") + with open(coords_ref_path, "w") as f: + f.write(coords_ref_content) + + # --- constraints.inp --- + num_atoms = len(symbols) + # CREST uses 1-based indices + a_atom += 1 + h_atom += 1 + b_atom += 1 + + # All atoms not directly involved in A–H–B go into the metadynamics atom list + list_of_atoms_numbers_not_participating_in_reaction = [ + i for i in range(1, num_atoms + 1) if i not in [a_atom, h_atom, b_atom] + ] + + constraints_path = 
os.path.join(path, "constraints.inp") + with open(constraints_path, "w") as f: + f.write("$constrain\n") + f.write(f" atoms: {a_atom}, {h_atom}, {b_atom}\n") + f.write(" force constant: 0.5\n") + f.write(" reference=coords.ref\n") + f.write(f" distance: {a_atom}, {h_atom}, auto\n") + f.write(f" distance: {h_atom}, {b_atom}, auto\n") + f.write("$metadyn\n") + if list_of_atoms_numbers_not_participating_in_reaction: + f.write( + f' atoms: {", ".join(map(str, list_of_atoms_numbers_not_participating_in_reaction))}\n' + ) + f.write("$end\n") + + # --- build CREST command string --- + # Example: crest coords.ref --cinp constraints.inp --noreftopo -T 8 + local_server = SERVERS.get("local", {}) + cpus = int(local_server.get("cpus", 8)) + if CREST_ENV_PATH: + crest_exe = "crest" + else: + crest_exe = CREST_PATH if CREST_PATH is not None else "crest" + + commands = [ + crest_exe, + "coords.ref", + "--cinp constraints.inp", + "--noreftopo", + f"-T {cpus}", + ] + command = " ".join(commands) + + # --- activation line (optional) --- + activation_line = CREST_ENV_PATH or "" + + if SERVERS.get("local") is not None: + cluster_soft = SERVERS["local"]["cluster_soft"].lower() + local_templates = submit_scripts.get("local", {}) + crest_template = local_templates.get("crest") + crest_job_template = local_templates.get("crest_job") + + if cluster_soft in ["condor", "htcondor"]: + # HTCondor branch with a built-in fallback template. 
+ if crest_template is None: + crest_template = ( + "universe = vanilla\n" + "executable = job.sh\n" + "output = out.txt\n" + "error = err.txt\n" + "log = log.txt\n" + "request_cpus = {cpus}\n" + "request_memory = {memory}\n" + "JobBatchName = {name}\n" + "queue\n" + ) + if crest_job_template is None: + crest_job_template = ( + "#!/bin/bash -l\n" + "{activation_line}\n" + "cd {path}\n" + "{commands}\n" + ) + sub_job = crest_template + format_params = { + "name": f"crest_{xyz_crest_int}", + "cpus": cpus, + "memory": int(SERVERS["local"].get("memory", 32.0) * 1024), + } + sub_job = sub_job.format(**format_params) + + with open( + os.path.join(path, settings["submit_filenames"]["HTCondor"]), "w" + ) as f: + f.write(sub_job) + + crest_job = crest_job_template.format( + path=path, + activation_line=activation_line, + commands=command, + ) + + with open(os.path.join(path, "job.sh"), "w") as f: + f.write(crest_job) + os.chmod(os.path.join(path, "job.sh"), 0o700) + + # Pre-create out/err for any status checkers that expect them + for fname in ("out.txt", "err.txt"): + fpath = os.path.join(path, fname) + if not os.path.exists(fpath): + with open(fpath, "w") as f: + f.write("") + os.chmod(fpath, 0o600) + + elif cluster_soft == "pbs": + # PBS branch with a built-in fallback template. 
+ if crest_template is None: + crest_template = ( + "#!/bin/bash -l\n" + "#PBS -q {queue}\n" + "#PBS -N {name}\n" + "#PBS -l select=1:ncpus={cpus}:mem={memory}gb\n" + "#PBS -o out.txt\n" + "#PBS -e err.txt\n\n" + "{activation_line}\n" + "cd {path}\n" + "{commands}\n" + ) + sub_job = crest_template + format_params = { + "queue": SERVERS["local"].get("queue", "alon_q"), + "name": f"crest_{xyz_crest_int}", + "cpus": cpus, + # 'memory' is in GB for the template: mem={memory}gb + "memory": int( + SERVERS["local"].get("memory", 32) + if SERVERS["local"].get("memory", 32) < 60 + else 40 + ), + "activation_line": activation_line, + "path": path, + "commands": command, + } + sub_job = sub_job.format(**format_params) + + submit_filename = settings["submit_filenames"]["PBS"] # usually 'submit.sh' + submit_path = os.path.join(path, submit_filename) + with open(submit_path, "w") as f: + f.write(sub_job) + os.chmod(submit_path, 0o700) + + else: + raise ValueError(f"Unsupported cluster_soft for CREST: {cluster_soft!r}") + + return path + + +def submit_crest_jobs(crest_paths: List[str]) -> dict: + """ + Submit CREST jobs to the server. + + Args: + crest_paths (List[str]): List of paths to the CREST directories. + + Returns: + dict: A dictionary containing job IDs as keys and their statuses as values. + """ + crest_jobs = {} + for crest_path in crest_paths: + job_status, job_id = submit_job(path=crest_path) + logger.info(f"CREST job {job_id} submitted for {crest_path}") + crest_jobs[job_id] = {"path": crest_path, "status": job_status} + return crest_jobs + + +def monitor_crest_jobs(crest_jobs: dict, check_interval: int = 300) -> None: + """ + Monitor CREST jobs until they are complete. + + Args: + crest_jobs (dict): Dictionary containing job information (job ID, path, and status). + check_interval (int): Time interval (in seconds) to wait between status checks. 
+ """ + while True: + all_done = True + for job_id, job_info in crest_jobs.items(): + if job_info["status"] not in ["done", "failed"]: + try: + job_info["status"] = check_job_status(job_id) # Update job status + except Exception as e: + logger.error(f"Error checking job status for job {job_id}: {e}") + job_info["status"] = "failed" + if job_info["status"] not in ["done", "failed"]: + all_done = False + if all_done: + break + time.sleep(min(check_interval, MAX_CHECK_INTERVAL_SECONDS)) + + +def process_completed_jobs(crest_jobs: dict) -> list: + """ + Process the completed CREST jobs and update XYZ guesses. + + Args: + crest_jobs (dict): Dictionary containing job information. + """ + xyz_guesses = [] + for job_id, job_info in crest_jobs.items(): + crest_path = job_info["path"] + if job_info["status"] == "done": + crest_best_path = os.path.join(crest_path, "crest_best.xyz") + if os.path.exists(crest_best_path): + with open(crest_best_path, "r") as f: + content = f.read() + xyz_guess = str_to_xyz(content) + xyz_guesses.append(xyz_guess) + else: + logger.error(f"crest_best.xyz not found in {crest_path}") + elif job_info["status"] == "failed": + logger.error(f"CREST job failed for {crest_path}") + + return xyz_guesses + +register_job_adapter('crest', CrestAdapter) diff --git a/arc/job/adapters/ts/crest_test.py b/arc/job/adapters/ts/crest_test.py new file mode 100644 index 0000000000..e243d8d43d --- /dev/null +++ b/arc/job/adapters/ts/crest_test.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +Unit tests for arc.job.adapters.ts.crest +""" + +import os +import tempfile +import unittest + +from arc.species.converter import str_to_xyz + + +class TestCrestAdapter(unittest.TestCase): + """ + Tests for CREST input generation. 
+ """ + + def setUp(self): + self.tmpdir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.tmpdir.cleanup() + + def test_creates_valid_input_files(self): + """ + Ensure CREST inputs are written with expected content/format. + """ + from arc.job.adapters.ts import crest as crest_mod + + xyz = str_to_xyz( + """O 0.0 0.0 0.0 + H 0.0 0.0 0.96 + H 0.9 0.0 0.0""" + ) + + backups = { + "settings": crest_mod.settings, + "submit_scripts": crest_mod.submit_scripts, + "CREST_PATH": crest_mod.CREST_PATH, + "CREST_ENV_PATH": crest_mod.CREST_ENV_PATH, + "SERVERS": crest_mod.SERVERS, + } + + try: + crest_mod.settings = {"submit_filenames": {"PBS": "submit.sh"}} + crest_mod.submit_scripts = { + "local": { + "crest": ( + "#PBS -q {queue}\n" + "#PBS -N {name}\n" + "#PBS -l select=1:ncpus={cpus}:mem={memory}gb\n" + ), + "crest_job": "{activation_line}\ncd {path}\n{commands}\n", + } + } + crest_mod.CREST_PATH = "/usr/bin/crest" + crest_mod.CREST_ENV_PATH = "" + crest_mod.SERVERS = { + "local": {"cluster_soft": "pbs", "cpus": 4, "memory": 8, "queue": "testq"} + } + + crest_dir = crest_mod.crest_ts_conformer_search( + xyz_guess=xyz, a_atom=0, h_atom=1, b_atom=2, path=self.tmpdir.name, xyz_crest_int=0 + ) + + coords_path = os.path.join(crest_dir, "coords.ref") + constraints_path = os.path.join(crest_dir, "constraints.inp") + submit_path = os.path.join(crest_dir, "submit.sh") + + self.assertTrue(os.path.exists(coords_path)) + self.assertTrue(os.path.exists(constraints_path)) + self.assertTrue(os.path.exists(submit_path)) + + with open(coords_path) as f: + coords = f.read().strip().splitlines() + self.assertEqual(coords[0].strip(), "$coord") + self.assertEqual(coords[-1].strip(), "$end") + self.assertEqual(len(coords) - 2, len(xyz["symbols"])) + + with open(constraints_path) as f: + constraints = f.read() + self.assertIn("atoms: 1, 2, 3", constraints) + self.assertIn("force constant: 0.5", constraints) + self.assertIn("reference=coords.ref", constraints) + 
self.assertIn("distance: 1, 2, auto", constraints) + self.assertIn("distance: 2, 3, auto", constraints) + self.assertIn("$metadyn", constraints) + self.assertTrue(constraints.strip().endswith("$end")) + finally: + crest_mod.settings = backups["settings"] + crest_mod.submit_scripts = backups["submit_scripts"] + crest_mod.CREST_PATH = backups["CREST_PATH"] + crest_mod.CREST_ENV_PATH = backups["CREST_ENV_PATH"] + crest_mod.SERVERS = backups["SERVERS"] + + def test_creates_submit_file_without_crest_templates(self): + """ + Ensure fallback submit template generation works when submit.py has no CREST templates. + """ + from arc.job.adapters.ts import crest as crest_mod + + xyz = str_to_xyz( + """O 0.0 0.0 0.0 + H 0.0 0.0 0.96 + H 0.9 0.0 0.0""" + ) + + backups = { + "settings": crest_mod.settings, + "submit_scripts": crest_mod.submit_scripts, + "CREST_PATH": crest_mod.CREST_PATH, + "CREST_ENV_PATH": crest_mod.CREST_ENV_PATH, + "SERVERS": crest_mod.SERVERS, + } + + try: + crest_mod.settings = {"submit_filenames": {"PBS": "submit.sh"}} + crest_mod.submit_scripts = {"local": {}} + crest_mod.CREST_PATH = "/usr/bin/crest" + crest_mod.CREST_ENV_PATH = "" + crest_mod.SERVERS = { + "local": {"cluster_soft": "pbs", "cpus": 4, "memory": 8, "queue": "testq"} + } + + crest_dir = crest_mod.crest_ts_conformer_search( + xyz_guess=xyz, a_atom=0, h_atom=1, b_atom=2, path=self.tmpdir.name, xyz_crest_int=1 + ) + + submit_path = os.path.join(crest_dir, "submit.sh") + self.assertTrue(os.path.exists(submit_path)) + with open(submit_path) as f: + submit_text = f.read() + self.assertIn("#PBS -q testq", submit_text) + self.assertIn("coords.ref --cinp constraints.inp --noreftopo -T 4", submit_text) + finally: + crest_mod.settings = backups["settings"] + crest_mod.submit_scripts = backups["submit_scripts"] + crest_mod.CREST_PATH = backups["CREST_PATH"] + crest_mod.CREST_ENV_PATH = backups["CREST_ENV_PATH"] + crest_mod.SERVERS = backups["SERVERS"] + + +if __name__ == "__main__": + unittest.main() 
diff --git a/arc/job/adapters/ts/heuristics.py b/arc/job/adapters/ts/heuristics.py index 9031aa9ec3..8582735cab 100644 --- a/arc/job/adapters/ts/heuristics.py +++ b/arc/job/adapters/ts/heuristics.py @@ -21,28 +21,44 @@ import os from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -from arc.common import (ARC_PATH, almost_equal_coords, get_angle_in_180_range, get_logger, is_angle_linear, - is_xyz_linear, key_by_val, read_yaml_file) +from arc.common import ( + ARC_PATH, + almost_equal_coords, + get_angle_in_180_range, + get_logger, + is_angle_linear, + is_xyz_linear, + key_by_val, + read_yaml_file, +) from arc.family import get_reaction_family_products from arc.job.adapter import JobAdapter from arc.job.adapters.common import _initialize_adapter, ts_adapters_by_rmg_family from arc.job.factory import register_job_adapter from arc.plotter import save_geo -from arc.species.converter import (compare_zmats, relocate_zmat_dummy_atoms_to_the_end, zmat_from_xyz, zmat_to_xyz, - add_atom_to_xyz_using_internal_coords, sorted_distances_of_atom) +from arc.species.converter import ( + add_atom_to_xyz_using_internal_coords, + compare_zmats, + relocate_zmat_dummy_atoms_to_the_end, + sorted_distances_of_atom, + zmat_from_xyz, + zmat_to_xyz, +) from arc.mapping.engine import map_two_species from arc.molecule.molecule import Molecule from arc.species.species import ARCSpecies, TSGuess, SpeciesError, colliding_atoms from arc.species.zmat import get_parameter_from_atom_indices, remove_zmat_atom_0, up_param, xyz_to_zmat from arc.species.vectors import calculate_angle +from arc.job.adapters.ts.seed_hub import get_ts_seeds if TYPE_CHECKING: from arc.level import Level from arc.reaction import ARCReaction - -FAMILY_SETS = {'hydrolysis_set_1': ['carbonyl_based_hydrolysis', 'ether_hydrolysis'], - 'hydrolysis_set_2': ['nitrile_hydrolysis']} +FAMILY_SETS = { + 'hydrolysis_set_1': ['carbonyl_based_hydrolysis', 'ether_hydrolysis'], + 'hydrolysis_set_2': 
['nitrile_hydrolysis'], +} DIHEDRAL_INCREMENT = 30 @@ -258,56 +274,60 @@ def execute_incore(self): multiplicity=rxn.multiplicity, ) - xyzs = list() - tsg, families = None, None - if rxn.family == 'H_Abstraction': - tsg = TSGuess(method='Heuristics') - tsg.tic() - xyzs = h_abstraction(reaction=rxn, dihedral_increment=self.dihedral_increment) - tsg.tok() - + tsg = TSGuess(method='Heuristics') + tsg.tic() + xyzs = get_ts_seeds( + reaction=rxn, + base_adapter='heuristics', + dihedral_increment=self.dihedral_increment, + ) + tsg.tok() if rxn.family in FAMILY_SETS['hydrolysis_set_1'] or rxn.family in FAMILY_SETS['hydrolysis_set_2']: - try: - tsg = TSGuess(method='Heuristics') - tsg.tic() - xyzs, families, indices = hydrolysis(reaction=rxn) - tsg.tok() - if not xyzs: - logger.warning(f'Heuristics TS search failed to generate any valid TS guesses for {rxn.label}.') - continue - except ValueError: + if not xyzs: + logger.warning( + f'Heuristics TS search failed to generate any valid TS guesses for {rxn.label}.' 
+ ) continue - for method_index, xyz in enumerate(xyzs): + for method_index, xyz_entry in enumerate(xyzs): + xyz = xyz_entry.get("xyz") + method_label = xyz_entry.get("method", "Heuristics") + family = xyz_entry.get("family", rxn.family) + if xyz is None: + continue unique = True for other_tsg in rxn.ts_species.ts_guesses: if almost_equal_coords(xyz, other_tsg.initial_xyz): - if 'heuristics' not in other_tsg.method.lower(): - other_tsg.method += ' and Heuristics' + existing_sources = getattr(other_tsg, "method_sources", None) + if existing_sources is not None: + combined_sources = list(existing_sources) + [method_label] + else: + combined_sources = [other_tsg.method, method_label] + other_tsg.method_sources = TSGuess._normalize_method_sources(combined_sources) unique = False break if unique: - ts_guess = TSGuess(method='Heuristics', + ts_guess = TSGuess(method=method_label, index=len(rxn.ts_species.ts_guesses), method_index=method_index, t0=tsg.t0, execution_time=tsg.execution_time, success=True, - family=rxn.family if families is None else families[method_index], + family=family, xyz=xyz, ) rxn.ts_species.ts_guesses.append(ts_guess) save_geo(xyz=xyz, path=self.local_path, - filename=f'Heuristics_{method_index}', + filename=f'{method_label}_{method_index}', format_='xyz', - comment=f'Heuristics {method_index}, family: {rxn.family}', + comment=f'{method_label} {method_index}, family: {rxn.family}', ) if len(self.reactions) < 5: - successes = len([tsg for tsg in rxn.ts_species.ts_guesses if tsg.success and 'heuristics' in tsg.method]) + successes = [tsg for tsg in rxn.ts_species.ts_guesses if tsg.success] if successes: - logger.info(f'Heuristics successfully found {successes} TS guesses for {rxn.label}.') + logger.info(f'Heuristics successfully found {len(successes)} TS guesses for {rxn.label}.') else: logger.info(f'Heuristics did not find any successful TS guesses for {rxn.label}.') @@ -873,7 +893,7 @@ def h_abstraction(reaction: 'ARCReaction', dihedral_increment 
(int, optional): The dihedral increment to use for B-H-A-C and D-B-H-C dihedral scans. Returns: List[dict] - Entries are Cartesian coordinates of TS guesses for all reactions. + Entries hold Cartesian coordinates of TS guesses and the generating method label. """ xyz_guesses = list() dihedral_increment = dihedral_increment or DIHEDRAL_INCREMENT @@ -952,7 +972,8 @@ def h_abstraction(reaction: 'ARCReaction', else: # This TS is unique, and has no atom collisions. zmats.append(zmat_guess) - xyz_guesses.append(xyz_guess) + xyz_guesses.append({"xyz": xyz_guess, "method": "Heuristics"}) + return xyz_guesses @@ -987,9 +1008,11 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in is_set_1 = reaction_family in hydrolysis_parameters["family_sets"]["set_1"] is_set_2 = reaction_family in hydrolysis_parameters["family_sets"]["set_2"] - main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices(reaction, - product_dict, - is_set_1) + main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices( + reaction, + product_dict, + is_set_1, + ) base_xyz_indices = { "a": xyz_indices["a"], "b": xyz_indices["b"], @@ -999,9 +1022,19 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in } adjustments_to_try = [False, True] if dihedrals_to_change_num == 1 else [True] for adjust_dihedral in adjustments_to_try: - chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices(initial_xyz, base_xyz_indices, xyz_indices, - hydrolysis_parameters,reaction_family, water, zmats_total, is_set_1, is_set_2, - dihedrals_to_change_num, should_adjust_dihedral=adjust_dihedral) + chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices( + initial_xyz, + base_xyz_indices, + xyz_indices, + hydrolysis_parameters, + reaction_family, + water, + zmats_total, + is_set_1, + is_set_2, + dihedrals_to_change_num, + should_adjust_dihedral=adjust_dihedral, + ) 
max_dihedrals_found = max(max_dihedrals_found, n_dihedrals_found) if xyz_guesses: xyz_guesses_total.extend(xyz_guesses) @@ -1015,8 +1048,8 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in condition_met = len(xyz_guesses_total) > 0 nitrile_in_inputs = any( - (pd.get("family") == "nitrile_hydrolysis") or - (isinstance(pd.get("family"), list) and "nitrile_hydrolysis" in pd.get("family")) + (pd.get("family") == "nitrile_hydrolysis") + or (isinstance(pd.get("family"), list) and "nitrile_hydrolysis" in pd.get("family")) for pd in product_dicts ) nitrile_already_found = any(fam == "nitrile_hydrolysis" for fam in reaction_families) @@ -1032,9 +1065,11 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in is_set_1 = reaction_family in hydrolysis_parameters["family_sets"]["set_1"] is_set_2 = reaction_family in hydrolysis_parameters["family_sets"]["set_2"] - main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices(reaction, - product_dict, - is_set_1) + main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices( + reaction, + product_dict, + is_set_1, + ) base_xyz_indices = { "a": xyz_indices["a"], "b": xyz_indices["b"], @@ -1048,10 +1083,18 @@ def hydrolysis(reaction: 'ARCReaction') -> Tuple[List[dict], List[dict], List[in break dihedrals_to_change_num += 1 chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices( - initial_xyz, base_xyz_indices, xyz_indices, - hydrolysis_parameters, reaction_family, water, zmats_total, is_set_1, is_set_2, - dihedrals_to_change_num, should_adjust_dihedral=True, - allow_nitrile_dihedrals=True + initial_xyz, + base_xyz_indices, + xyz_indices, + hydrolysis_parameters, + reaction_family, + water, + zmats_total, + is_set_1, + is_set_2, + dihedrals_to_change_num, + should_adjust_dihedral=True, + allow_nitrile_dihedrals=True, ) max_dihedrals_found = max(max_dihedrals_found, n_dihedrals_found) @@ -1083,11 
+1126,13 @@ def get_products_and_check_families(reaction: 'ARCReaction') -> Tuple[List[dict] consider_arc_families=True, ) carbonyl_based_present = any( - "carbonyl_based_hydrolysis" in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) + "carbonyl_based_hydrolysis" + in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) for d in product_dicts ) ether_present = any( - "ether_hydrolysis" in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) + "ether_hydrolysis" + in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) for d in product_dicts ) @@ -1118,9 +1163,11 @@ def has_carbonyl_based_hydrolysis(reaction_families: List[dict]) -> bool: return any(family == "carbonyl_based_hydrolysis" for family in reaction_families) -def extract_reactant_and_indices(reaction: 'ARCReaction', - product_dict: dict, - is_set_1: bool) -> Tuple[ARCSpecies, ARCSpecies, dict, dict]: +def extract_reactant_and_indices( + reaction: 'ARCReaction', + product_dict: dict, + is_set_1: bool, +) -> Tuple[ARCSpecies, ARCSpecies, dict, dict]: """ Extract the reactant molecules and relevant atomic indices (a,b,e,d,o,h1) for the hydrolysis reaction. 
@@ -1163,11 +1210,13 @@ def extract_reactant_and_indices(reaction: 'ARCReaction', main_reactant, a_xyz_index, b_xyz_index, - two_neighbors + two_neighbors, ) except ValueError as e: - raise ValueError(f"Failed to determine neighbors by electronegativity for atom {a_xyz_index} " - f"in species {main_reactant.label}: {e}") + raise ValueError( + f"Failed to determine neighbors by electronegativity for atom {a_xyz_index} " + f"in species {main_reactant.label}: {e}" + ) o_index = len(main_reactant.mol.atoms) h1_index = o_index + 1 @@ -1178,25 +1227,26 @@ def extract_reactant_and_indices(reaction: 'ARCReaction', "e": e_xyz_index, "d": d_xyz_indices, "o": o_index, - "h1": h1_index + "h1": h1_index, } return main_reactant, water, initial_xyz, xyz_indices -def process_chosen_d_indices(initial_xyz: dict, - base_xyz_indices: dict, - xyz_indices: dict, - hydrolysis_parameters: dict, - reaction_family: str, - water: 'ARCSpecies', - zmats_total: List[dict], - is_set_1: bool, - is_set_2: bool, - dihedrals_to_change_num: int, - should_adjust_dihedral: bool, - allow_nitrile_dihedrals: bool = False - ) -> Tuple[Dict[str, int], List[Dict[str, Any]], List[Dict[str, Any]], int]: +def process_chosen_d_indices( + initial_xyz: dict, + base_xyz_indices: dict, + xyz_indices: dict, + hydrolysis_parameters: dict, + reaction_family: str, + water: 'ARCSpecies', + zmats_total: List[dict], + is_set_1: bool, + is_set_2: bool, + dihedrals_to_change_num: int, + should_adjust_dihedral: bool, + allow_nitrile_dihedrals: bool = False, +) -> Tuple[Dict[str, int], List[Dict[str, Any]], List[Dict[str, Any]], int]: """ Iterates over the 'd' indices to process TS guess generation. @@ -1214,7 +1264,6 @@ def process_chosen_d_indices(initial_xyz: dict, should_adjust_dihedral (bool): Whether to adjust dihedral angles. allow_nitrile_dihedrals (bool, optional): Force-enable dihedral adjustments for nitriles. Defaults to False. 
- Returns: Tuple[Dict[str, int], List[Dict[str, Any]], List[Dict[str, Any]]]: - Chosen indices for TS generation. @@ -1224,11 +1273,18 @@ def process_chosen_d_indices(initial_xyz: dict, """ max_dihedrals_found = 0 for d_index in xyz_indices.get("d", []) or [None]: - chosen_xyz_indices = {**base_xyz_indices, "d": d_index} if d_index is not None else {**base_xyz_indices, - "d": None} + chosen_xyz_indices = {**base_xyz_indices, "d": d_index} if d_index is not None else { + **base_xyz_indices, + "d": None, + } current_zmat, zmat_indices = setup_zmat_indices(initial_xyz, chosen_xyz_indices) - matches = get_matching_dihedrals(current_zmat, zmat_indices['a'], zmat_indices['b'], - zmat_indices['e'], zmat_indices['d']) + matches = get_matching_dihedrals( + current_zmat, + zmat_indices['a'], + zmat_indices['b'], + zmat_indices['e'], + zmat_indices['d'], + ) max_dihedrals_found = max(max_dihedrals_found, len(matches)) if should_adjust_dihedral and dihedrals_to_change_num > len(matches): continue @@ -1246,22 +1302,28 @@ def process_chosen_d_indices(initial_xyz: dict, zmat_variants = generate_dihedral_variants(current_zmat, indices, adjustment_factors) if zmat_variants: adjusted_zmats.extend(zmat_variants) - if not adjusted_zmats: - pass - else: + if adjusted_zmats: zmats_to_process = adjusted_zmats ts_guesses_list = [] for zmat_to_process in zmats_to_process: ts_guesses, updated_zmats = process_family_specific_adjustments( - is_set_1, is_set_2, reaction_family, hydrolysis_parameters, - zmat_to_process, water, chosen_xyz_indices, zmats_total) + is_set_1, + is_set_2, + reaction_family, + hydrolysis_parameters, + zmat_to_process, + water, + chosen_xyz_indices, + zmats_total, + ) zmats_total = updated_zmats ts_guesses_list.extend(ts_guesses) if attempted_dihedral_adjustments and not ts_guesses_list and ( - reaction_family != 'nitrile_hydrolysis' or allow_nitrile_dihedrals): - flipped_zmats= [] + reaction_family != 'nitrile_hydrolysis' or allow_nitrile_dihedrals + ): + 
flipped_zmats = [] adjustment_factors = [15, 25, 35, 45, 55] for indices in indices_list: flipped_variants = generate_dihedral_variants(current_zmat, indices, adjustment_factors, flip=True) @@ -1269,8 +1331,14 @@ def process_chosen_d_indices(initial_xyz: dict, for zmat_to_process in flipped_zmats: ts_guesses, updated_zmats = process_family_specific_adjustments( - is_set_1, is_set_2, reaction_family, hydrolysis_parameters, - zmat_to_process, water, chosen_xyz_indices, zmats_total + is_set_1, + is_set_2, + reaction_family, + hydrolysis_parameters, + zmat_to_process, + water, + chosen_xyz_indices, + zmats_total, ) zmats_total = updated_zmats ts_guesses_list.extend(ts_guesses) @@ -1311,10 +1379,12 @@ def get_main_reactant_and_water_from_hydrolysis_reaction(reaction: 'ARCReaction' return arc_reactant, water -def get_neighbors_by_electronegativity(spc: 'ARCSpecies', - atom_index: int, - exclude_index: int, - two_neighbors: bool = True) -> Tuple[int, List[int]]: +def get_neighbors_by_electronegativity( + spc: 'ARCSpecies', + atom_index: int, + exclude_index: int, + two_neighbors: bool = True, +) -> Tuple[int, List[int]]: """ Retrieve the top two neighbors of a given atom in a species, sorted by their effective electronegativity, excluding a specified neighbor. @@ -1340,8 +1410,11 @@ def get_neighbors_by_electronegativity(spc: 'ARCSpecies', Raises: ValueError: If the atom has no valid neighbors. 
""" - neighbors = [neighbor for neighbor in spc.mol.atoms[atom_index].edges.keys() - if spc.mol.atoms.index(neighbor) != exclude_index] + neighbors = [ + neighbor + for neighbor in spc.mol.atoms[atom_index].edges.keys() + if spc.mol.atoms.index(neighbor) != exclude_index + ] if not neighbors: raise ValueError(f"Atom at index {atom_index} has no valid neighbors.") @@ -1355,12 +1428,17 @@ def get_neighbor_total_electronegativity(neighbor: 'Atom') -> float: float: The total electronegativity of the neighbor """ return sum( - ELECTRONEGATIVITIES[n.symbol] * neighbor.edges[n].order - for n in neighbor.edges.keys() + ELECTRONEGATIVITIES[n.symbol] * neighbor.edges[n].order for n in neighbor.edges.keys() ) - effective_electronegativities = [(ELECTRONEGATIVITIES[n.symbol] * spc.mol.atoms[atom_index].edges[n].order, - get_neighbor_total_electronegativity(n), n ) for n in neighbors] + effective_electronegativities = [ + ( + ELECTRONEGATIVITIES[n.symbol] * spc.mol.atoms[atom_index].edges[n].order, + get_neighbor_total_electronegativity(n), + n, + ) + for n in neighbors + ] effective_electronegativities.sort(reverse=True, key=lambda x: (x[0], x[1])) sorted_neighbors = [spc.mol.atoms.index(n[2]) for n in effective_electronegativities] most_electronegative = sorted_neighbors[0] @@ -1368,8 +1446,7 @@ def get_neighbor_total_electronegativity(neighbor: 'Atom') -> float: return most_electronegative, remaining_neighbors -def setup_zmat_indices(initial_xyz: dict, - xyz_indices: dict) -> Tuple[dict, dict]: +def setup_zmat_indices(initial_xyz: dict, xyz_indices: dict) -> Tuple[dict, dict]: """ Convert XYZ coordinates to Z-matrix format and set up corresponding indices. 
@@ -1387,26 +1464,28 @@ def setup_zmat_indices(initial_xyz: dict, 'a': key_by_val(initial_zmat.get('map', {}), xyz_indices['a']), 'b': key_by_val(initial_zmat.get('map', {}), xyz_indices['b']), 'e': key_by_val(initial_zmat.get('map', {}), xyz_indices['e']), - 'd': key_by_val(initial_zmat.get('map', {}), xyz_indices['d']) if xyz_indices['d'] is not None else None + 'd': key_by_val(initial_zmat.get('map', {}), xyz_indices['d']) if xyz_indices['d'] is not None else None, } return initial_zmat, zmat_indices -def generate_dihedral_variants(zmat: dict, - indices: List[int], - adjustment_factors: List[float], - flip: bool = False, - tolerance_degrees: float = 10.0) -> List[dict]: +def generate_dihedral_variants( + zmat: dict, + indices: List[int], + adjustment_factors: List[float], + flip: bool = False, + tolerance_degrees: float = 10.0, +) -> List[dict]: """ - Create variants of a Z-matrix by adjusting dihedral angles using multiple adjustment factors. + Create variants of a Z-matrix by adjusting dihedral angles using multiple adjustment factors. This function creates variants of the Z-matrix using different adjustment factors: - 1. Retrieve the current dihedral value and normalize it to the (-180°, 180°] range. - 2. For each adjustment factor, slightly push the angle away from 0° or ±180° to avoid - unstable, boundary configurations. - 3. If `flip=True`, the same procedure is applied starting from a flipped - (180°-shifted) baseline angle. - 4. Each adjusted or flipped variant is deep-copied to ensure independence. + 1. Retrieve the current dihedral value and normalize it to the (-180°, 180°] range. + 2. For each adjustment factor, slightly push the angle away from 0° or ±180° to avoid + unstable, boundary configurations. + 3. If `flip=True`, the same procedure is applied starting from a flipped + (180°-shifted) baseline angle. + 4. Each adjusted or flipped variant is deep-copied to ensure independence. Args: zmat (dict): The initial Z-matrix. 
@@ -1414,7 +1493,8 @@ def generate_dihedral_variants(zmat: dict, adjustment_factors (List[float], optional): List of factors to try. flip (bool, optional): Whether to start from a flipped (180°) baseline dihedral angle. Defaults to False. - tolerance_degrees (float, optional): Tolerance (in degrees) for detecting angles near 0° or ±180°. Defaults to 10.0. + tolerance_degrees (float, optional): Tolerance (in degrees) for detecting angles near 0° or ±180°. + Defaults to 10.0. Returns: List[dict]: List of Z-matrix variants with adjusted dihedral angles. @@ -1440,8 +1520,9 @@ def push_up_dihedral(val: float, adj_factor: float) -> float: seed_value = normalized_value if flip: seed_value = get_angle_in_180_range(normalized_value + 180.0) - boundary_like = ((abs(seed_value) < tolerance_degrees) - or (180 - tolerance_degrees <= abs(seed_value) <= 180+tolerance_degrees)) + boundary_like = (abs(seed_value) < tolerance_degrees) or ( + 180 - tolerance_degrees <= abs(seed_value) <= 180 + tolerance_degrees + ) if boundary_like: for factor in adjustment_factors: variant = copy.deepcopy(zmat) @@ -1450,11 +1531,13 @@ def push_up_dihedral(val: float, adj_factor: float) -> float: return variants -def get_matching_dihedrals(zmat: dict, - a: int, - b: int, - e: int, - d: Optional[int]) -> List[List[int]]: +def get_matching_dihedrals( + zmat: dict, + a: int, + b: int, + e: int, + d: Optional[int], +) -> List[List[int]]: """ Retrieve all dihedral angles in the Z-matrix that match the given atom indices. 
This function scans the Z-matrix for dihedral parameters (keys starting with 'D_' or 'DX_') @@ -1484,11 +1567,13 @@ def get_matching_dihedrals(zmat: dict, return matches -def stretch_ab_bond(initial_zmat: 'dict', - xyz_indices: 'dict', - zmat_indices: 'dict', - hydrolysis_parameters: 'dict', - reaction_family: str) -> None: +def stretch_ab_bond( + initial_zmat: dict, + xyz_indices: dict, + zmat_indices: dict, + hydrolysis_parameters: dict, + reaction_family: str, +) -> None: """ Stretch the bond between atoms a and b in the Z-matrix based on the reaction family parameters. @@ -1519,16 +1604,18 @@ def stretch_ab_bond(initial_zmat: 'dict', stretch_zmat_bond(zmat=initial_zmat, indices=indices, stretch=stretch_degree) -def process_family_specific_adjustments(is_set_1: bool, - is_set_2: bool, - reaction_family: str, - hydrolysis_parameters: dict, - initial_zmat: dict, - water: 'ARCSpecies', - xyz_indices: dict, - zmats_total: List[dict]) -> Tuple[List[dict], List[dict]]: +def process_family_specific_adjustments( + is_set_1: bool, + is_set_2: bool, + reaction_family: str, + hydrolysis_parameters: dict, + initial_zmat: dict, + water: 'ARCSpecies', + xyz_indices: dict, + zmats_total: List[dict], +) -> Tuple[List[dict], List[dict]]: """ - Process specific adjustments for different hydrolysis reaction families if needed, then generate TS guesses . + Process specific adjustments for different hydrolysis reaction families if needed, then generate TS guesses. Args: is_set_1 (bool): Whether the reaction belongs to parameter set 1. @@ -1546,38 +1633,52 @@ def process_family_specific_adjustments(is_set_1: bool, Raises: ValueError: If the reaction family is not supported. 
""" - a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz= xyz_indices.values() + a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz = xyz_indices.values() r_atoms = [a_xyz, o_xyz, o_xyz] a_atoms = [[b_xyz, a_xyz], [a_xyz, o_xyz], [h1_xyz, o_xyz]] - d_atoms = ([[e_xyz, d_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] - if d_xyz is not None else - [[e_xyz, b_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]]) + d_atoms = ( + [[e_xyz, d_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] + if d_xyz is not None + else [[e_xyz, b_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] + ) r_value = hydrolysis_parameters['family_parameters'][str(reaction_family)]['r_value'] a_value = hydrolysis_parameters['family_parameters'][str(reaction_family)]['a_value'] d_values = hydrolysis_parameters['family_parameters'][str(reaction_family)]['d_values'] if is_set_1 or is_set_2: initial_xyz = zmat_to_xyz(initial_zmat) - return generate_hydrolysis_ts_guess(initial_xyz, xyz_indices.values(), water, r_atoms, a_atoms, d_atoms, - r_value, a_value, d_values, zmats_total, is_set_1, - threshold=0.6 if reaction_family == 'nitrile_hydrolysis' else 0.8) + return generate_hydrolysis_ts_guess( + initial_xyz, + xyz_indices.values(), + water, + r_atoms, + a_atoms, + d_atoms, + r_value, + a_value, + d_values, + zmats_total, + is_set_1, + threshold=0.6 if reaction_family == 'nitrile_hydrolysis' else 0.8, + ) else: raise ValueError(f"Family {reaction_family} not supported for hydrolysis TS guess generation.") -def generate_hydrolysis_ts_guess(initial_xyz: dict, - xyz_indices: List[int], - water: 'ARCSpecies', - r_atoms: List[int], - a_atoms: List[List[int]], - d_atoms: List[List[int]], - r_value: List[float], - a_value: List[float], - d_values: List[List[float]], - zmats_total: List[dict], - is_set_1: bool, - threshold: float - ) -> Tuple[List[dict], List[dict]]: +def generate_hydrolysis_ts_guess( + initial_xyz: dict, + xyz_indices: List[int], + water: 'ARCSpecies', + r_atoms: 
List[int], + a_atoms: List[List[int]], + d_atoms: List[List[int]], + r_value: List[float], + a_value: List[float], + d_values: List[List[float]], + zmats_total: List[dict], + is_set_1: bool, + threshold: float, +) -> Tuple[List[dict], List[dict]]: """ Generate Z-matrices and Cartesian coordinates for transition state (TS) guesses. @@ -1600,7 +1701,7 @@ def generate_hydrolysis_ts_guess(initial_xyz: dict, """ xyz_guesses = [] - for index, d_value in enumerate(d_values): + for d_value in d_values: xyz_guess = copy.deepcopy(initial_xyz) for i in range(3): xyz_guess = add_atom_to_xyz_using_internal_coords( @@ -1611,23 +1712,22 @@ def generate_hydrolysis_ts_guess(initial_xyz: dict, d_indices=d_atoms[i], r_value=r_value[i], a_value=a_value[i], - d_value=d_value[i] + d_value=d_value[i], ) - a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz= xyz_indices - are_valid_bonds=check_ts_bonds(xyz_guess, [o_xyz, h1_xyz, h1_xyz+1, a_xyz, b_xyz]) - colliding=colliding_atoms(xyz_guess, threshold=threshold) + a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz = xyz_indices + are_valid_bonds = check_ts_bonds(xyz_guess, [o_xyz, h1_xyz, h1_xyz + 1, a_xyz, b_xyz]) + colliding = colliding_atoms(xyz_guess, threshold=threshold) duplicate = any(compare_zmats(existing, xyz_to_zmat(xyz_guess)) for existing in zmats_total) if is_set_1: - dihedral_edao=[e_xyz, d_xyz, a_xyz, o_xyz] - dao_is_linear=check_dao_angle(dihedral_edao, xyz_guess) + dihedral_edao = [e_xyz, d_xyz, a_xyz, o_xyz] + dao_is_linear = check_dao_angle(dihedral_edao, xyz_guess) else: - dao_is_linear=False + dao_is_linear = False if xyz_guess is not None and not colliding and not duplicate and are_valid_bonds and not dao_is_linear: xyz_guesses.append(xyz_guess) zmats_total.append(xyz_to_zmat(xyz_guess)) - return xyz_guesses, zmats_total @@ -1644,7 +1744,7 @@ def check_dao_angle(d_indices: List[int], xyz_guess: dict) -> bool: """ angle_indices = [d_indices[1], d_indices[2], d_indices[3]] angle_value = calculate_angle(xyz_guess, angle_indices) - 
norm_value=(angle_value + 180) % 180 + norm_value = (angle_value + 180) % 180 return (norm_value < 10) or (norm_value > 170) @@ -1659,7 +1759,7 @@ def check_ts_bonds(transition_state_xyz: dict, tested_atom_indices: list) -> boo Returns: bool: Whether the transition state guess has the expected water-related bonds. """ - oxygen_index, h1_index, h2_index, a_index, b_index= tested_atom_indices + oxygen_index, h1_index, h2_index, a_index, b_index = tested_atom_indices oxygen_bonds = sorted_distances_of_atom(transition_state_xyz, oxygen_index) h1_bonds = sorted_distances_of_atom(transition_state_xyz, h1_index) h2_bonds = sorted_distances_of_atom(transition_state_xyz, h2_index) @@ -1678,10 +1778,12 @@ def check_oxygen_bonds(bonds): return rel_error <= 0.1 return False - oxygen_has_valid_bonds = (oxygen_bonds[0][0] == h2_index and check_oxygen_bonds(oxygen_bonds)) - h1_has_valid_bonds = (h1_bonds[0][0] in {oxygen_index, b_index}and h1_bonds[1][0] in {oxygen_index, b_index}) + oxygen_has_valid_bonds = oxygen_bonds[0][0] == h2_index and check_oxygen_bonds(oxygen_bonds) + h1_has_valid_bonds = (h1_bonds[0][0] in {oxygen_index, b_index}) and ( + h1_bonds[1][0] in {oxygen_index, b_index} + ) h2_has_valid_bonds = h2_bonds[0][0] == oxygen_index return oxygen_has_valid_bonds and h1_has_valid_bonds and h2_has_valid_bonds -register_job_adapter('heuristics', HeuristicsAdapter) +register_job_adapter("heuristics", HeuristicsAdapter) diff --git a/arc/job/adapters/ts/heuristics_test.py b/arc/job/adapters/ts/heuristics_test.py index 250e10d852..fba89e9462 100644 --- a/arc/job/adapters/ts/heuristics_test.py +++ b/arc/job/adapters/ts/heuristics_test.py @@ -10,6 +10,8 @@ import os import shutil import unittest +from types import SimpleNamespace +from unittest.mock import patch from arc.common import ARC_TESTING_PATH, almost_equal_coords from arc.family import get_reaction_family_products @@ -31,6 +33,7 @@ check_dao_angle, check_ts_bonds, ) +from arc.job.adapters.ts.seed_hub import 
get_ts_seeds, get_wrapper_constraints from arc.reaction import ARCReaction from arc.species.converter import str_to_xyz, zmat_to_xyz, zmat_from_xyz from arc.species.species import ARCSpecies @@ -2258,5 +2261,61 @@ def tearDownClass(cls): shutil.rmtree(os.path.join(ARC_TESTING_PATH, 'heuristics_1'), ignore_errors=True) +class TestHeuristicsHub(unittest.TestCase): + """Unit tests for shared heuristic seed and CREST-constraint helpers.""" + + def test_get_ts_seeds_h_abstraction(self): + rxn = SimpleNamespace(family='H_Abstraction') + with patch('arc.job.adapters.ts.heuristics.h_abstraction', + return_value=[{'xyz': {'symbols': ('H',), 'coords': ((0.0, 0.0, 0.0),), 'isotopes': (1,)}, + 'method': 'Heuristics'}]): + seeds = get_ts_seeds(reaction=rxn, base_adapter='heuristics', dihedral_increment=60) + self.assertEqual(len(seeds), 1) + self.assertEqual(seeds[0]['family'], 'H_Abstraction') + self.assertEqual(seeds[0]['method'], 'Heuristics') + self.assertEqual(seeds[0]['source_adapter'], 'heuristics') + + def test_get_ts_seeds_hydrolysis(self): + rxn = SimpleNamespace(family='carbonyl_based_hydrolysis') + xyz = {'symbols': ('O',), 'coords': ((0.0, 0.0, 0.0),), 'isotopes': (16,)} + with patch('arc.job.adapters.ts.heuristics.hydrolysis', + return_value=([xyz], ['carbonyl_based_hydrolysis'], [[0, 1, 2]])): + seeds = get_ts_seeds(reaction=rxn, base_adapter='heuristics') + self.assertEqual(len(seeds), 1) + self.assertEqual(seeds[0]['family'], 'carbonyl_based_hydrolysis') + self.assertEqual(seeds[0]['xyz'], xyz) + self.assertEqual(seeds[0]['metadata'], {'indices': [0, 1, 2]}) + + def test_get_wrapper_constraints_crest(self): + rxn = SimpleNamespace(family='H_Abstraction') + xyz = str_to_xyz("""O 0.0000 0.0000 0.0000 + H 0.0000 0.0000 0.9600 + H 0.9000 0.0000 0.0000""") + seed = {'xyz': xyz, 'family': rxn.family} + atoms = get_wrapper_constraints(wrapper='crest', reaction=rxn, seed=seed) + self.assertIsInstance(atoms, dict) + self.assertSetEqual(set(atoms.keys()), {'A', 'H', 
'B'}) + self.assertTrue(all(isinstance(v, int) for v in atoms.values())) + + def test_get_wrapper_constraints_crest_unsupported_family(self): + rxn = SimpleNamespace(family='carbonyl_based_hydrolysis') + xyz = str_to_xyz("""O 0.0000 0.0000 0.0000 + H 0.0000 0.0000 0.9600 + H 0.9000 0.0000 0.0000""") + seed = {'xyz': xyz, 'family': rxn.family} + atoms = get_wrapper_constraints(wrapper='crest', reaction=rxn, seed=seed) + self.assertIsNone(atoms) + + def test_get_ts_seeds_unsupported_adapter(self): + rxn = SimpleNamespace(family='H_Abstraction') + with self.assertRaises(ValueError): + get_ts_seeds(reaction=rxn, base_adapter='gcn') + + def test_get_wrapper_constraints_unsupported_wrapper(self): + rxn = SimpleNamespace(family='H_Abstraction') + with self.assertRaises(ValueError): + get_wrapper_constraints(wrapper='foo_wrapper', reaction=rxn, seed={}) + + if __name__ == '__main__': unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) diff --git a/arc/job/adapters/ts/seed_hub.py b/arc/job/adapters/ts/seed_hub.py new file mode 100644 index 0000000000..4a38254cdb --- /dev/null +++ b/arc/job/adapters/ts/seed_hub.py @@ -0,0 +1,168 @@ +""" +Shared TS-seed and wrapper-constraint hub. + +This module centralizes: +1. How TS seeds are requested from a base TS-search adapter. +2. How wrapper adapters (e.g., CREST) request family-specific constraints for a seed. +""" + +from typing import Dict, List, Optional + +from arc.common import get_logger +from arc.species.converter import xyz_to_dmat + +logger = get_logger() + + +def get_ts_seeds(reaction: 'ARCReaction', + base_adapter: str = 'heuristics', + dihedral_increment: Optional[int] = None, + ) -> List[dict]: + """ + Return TS seed entries from a base TS-search adapter. + + Seed schema: + - ``xyz`` (dict): Cartesian coordinates. + - ``family`` (str): The family associated with this seed. + - ``method`` (str): Human-readable generator label. + - ``source_adapter`` (str): Adapter id that generated the seed. 
+ - ``metadata`` (dict, optional): Adapter-specific auxiliary fields. + + Args: + reaction: The ARC reaction object. + base_adapter: The underlying TS-search adapter providing seeds. + dihedral_increment: Optional scan increment used by adapters that support it. + """ + adapter = (base_adapter or '').lower() + if adapter != 'heuristics': + raise ValueError(f'Unsupported TS seed base adapter: {base_adapter}') + + # Lazily import to avoid circular imports with heuristics.py. + from arc.job.adapters.ts.heuristics import FAMILY_SETS, h_abstraction, hydrolysis + + xyz_entries = list() + if reaction.family == 'H_Abstraction': + xyzs = h_abstraction(reaction=reaction, dihedral_increment=dihedral_increment) + for entry in xyzs: + xyz = entry.get('xyz') if isinstance(entry, dict) else entry + method = entry.get('method', 'Heuristics') if isinstance(entry, dict) else 'Heuristics' + if xyz is not None: + xyz_entries.append({ + 'xyz': xyz, + 'method': method, + 'family': reaction.family, + 'source_adapter': 'heuristics', + 'metadata': {}, + }) + elif reaction.family in FAMILY_SETS['hydrolysis_set_1'] or reaction.family in FAMILY_SETS['hydrolysis_set_2']: + try: + xyzs_raw, families, indices = hydrolysis(reaction=reaction) + xyz_entries = [{ + 'xyz': xyz, + 'method': 'Heuristics', + 'family': family, + 'source_adapter': 'heuristics', + 'metadata': {'indices': idx}, + } for xyz, family, idx in zip(xyzs_raw, families, indices)] + except ValueError: + xyz_entries = list() + return xyz_entries + + +def get_wrapper_constraints(wrapper: str, + reaction: 'ARCReaction', + seed: dict, + ) -> Optional[dict]: + """ + Return wrapper-specific constraints for a TS seed. + + Args: + wrapper: Wrapper adapter id (e.g., ``crest``). + reaction: The ARC reaction object. + seed: A seed entry returned by :func:`get_ts_seeds`. 
+ """ + wrapper_name = (wrapper or '').lower() + if wrapper_name != 'crest': + raise ValueError(f'Unsupported wrapper adapter: {wrapper}') + return _get_crest_constraints(reaction=reaction, seed=seed) + + +def _get_crest_constraints(reaction: 'ARCReaction', seed: dict) -> Optional[Dict[str, int]]: + """ + Return CREST constraints for a seed. + + Currently, only H_Abstraction is supported. + """ + family = seed.get('family') or reaction.family + xyz = seed.get('xyz') + if family != 'H_Abstraction' or xyz is None: + return None + return _get_h_abs_atoms_from_xyz(xyz) + + +def _get_h_abs_atoms_from_xyz(xyz: dict) -> Optional[Dict[str, int]]: + """ + Determine H-abstraction atoms from a TS guess. + + Returns: + Optional[Dict[str, int]]: ``{'H': int, 'A': int, 'B': int}``, or ``None``. + """ + symbols = xyz.get('symbols') if isinstance(xyz, dict) else None + if not symbols: + return None + dmat = xyz_to_dmat(xyz) + if dmat is None: + return None + + closest_atoms = dict() + for i in range(len(symbols)): + nearest = sorted( + ((dmat[i][j], j) for j in range(len(symbols)) if j != i), + key=lambda x: x[0], + )[:2] + closest_atoms[i] = [idx for _, idx in nearest] + + hydrogen_indices = [i for i, symbol in enumerate(symbols) if symbol.startswith('H')] + condition_occurrences = list() + + for hydrogen_index in hydrogen_indices: + atom_neighbors = closest_atoms[hydrogen_index] + is_heavy_present = any(not symbols[atom].startswith('H') for atom in atom_neighbors) + if_hydrogen_present = any(symbols[atom].startswith('H') and atom != hydrogen_index for atom in atom_neighbors) + + if is_heavy_present and if_hydrogen_present: + condition_occurrences.append({'H': hydrogen_index, 'A': atom_neighbors[0], 'B': atom_neighbors[1]}) + + if condition_occurrences: + if len(condition_occurrences) > 1: + occurrence_distances = list() + for occurrence in condition_occurrences: + h_atom = occurrence['H'] + a_atom = occurrence['A'] + b_atom = occurrence['B'] + 
occurrence_distances.append((occurrence, dmat[h_atom][a_atom] + dmat[h_atom][b_atom])) + best_occurrence = min(occurrence_distances, key=lambda x: x[1])[0] + return {'H': best_occurrence['H'], 'A': best_occurrence['A'], 'B': best_occurrence['B']} + single_occurrence = condition_occurrences[0] + return {'H': single_occurrence['H'], 'A': single_occurrence['A'], 'B': single_occurrence['B']} + + min_distance = float('inf') + selected_hydrogen = None + selected_heavy_atoms = None + for hydrogen_index in hydrogen_indices: + atom_neighbors = closest_atoms[hydrogen_index] + heavy_atoms = [atom for atom in atom_neighbors if not symbols[atom].startswith('H')] + if len(heavy_atoms) < 2: + continue + distances = dmat[hydrogen_index][heavy_atoms[0]] + dmat[hydrogen_index][heavy_atoms[1]] + if distances < min_distance: + min_distance = distances + selected_hydrogen = hydrogen_index + selected_heavy_atoms = heavy_atoms + + if selected_hydrogen is not None and selected_heavy_atoms is not None: + return {'H': selected_hydrogen, 'A': selected_heavy_atoms[0], 'B': selected_heavy_atoms[1]} + + logger.warning('No valid hydrogen atom found for CREST H-abstraction atoms.') + return None + diff --git a/arc/job/pipe/pipe_coordinator.py b/arc/job/pipe/pipe_coordinator.py index bf80a68e3d..7b9ce69bbc 100644 --- a/arc/job/pipe/pipe_coordinator.py +++ b/arc/job/pipe/pipe_coordinator.py @@ -8,18 +8,22 @@ Family-specific task planning lives in ``pipe_planner.py``. 
""" +import json import os import time -from typing import TYPE_CHECKING, Dict, List +from typing import TYPE_CHECKING, Dict, List, Optional from arc.common import get_logger from arc.imports import settings +from arc.job.pipe.pipe_run import PipeRun, SCHEDULER_VISIBILITY_GRACE, ingest_completed_task +from arc.job.factory import job_factory from arc.job.pipe.pipe_run import PipeRun, ingest_completed_task from arc.job.pipe.pipe_state import ( TASK_FAMILY_TO_JOB_TYPE, PipeRunState, TaskState, TaskSpec, - TaskStateRecord, read_task_state, + TaskStateRecord, get_task_attempt_dir, read_task_state, ) +from arc.level import Level if TYPE_CHECKING: from arc.scheduler import Scheduler @@ -189,7 +193,38 @@ def register_pipe_run_from_dir(self, pipe_root: str) -> PipeRun: self.active_pipes[pipe.run_id] = pipe return pipe - def poll_pipes(self) -> None: + @staticmethod + def _is_scheduler_job_alive(pipe: PipeRun, + server_job_ids: Optional[List[str]], + ) -> bool: + """ + Check whether a pipe run's scheduler job is still in the cluster queue. + + For PBS/Slurm array jobs the stored ``scheduler_job_id`` is the base + ID (e.g. ``'4018898[]'`` for PBS, ``'12345'`` for Slurm), while the + queue lists individual elements (``'4018898[0]'`` for PBS, + ``'12345_7'`` for Slurm). We match on the numeric prefix with both + ``[`` and ``_`` array separators so both formats are recognised. + + Returns True (optimistic) when *server_job_ids* is unavailable. + """ + if server_job_ids is None or pipe.scheduler_job_id is None: + return True # Cannot determine — assume alive. + base = pipe.scheduler_job_id.rstrip('[]') + if any(jid == base + or jid.startswith(base + '[') + or jid.startswith(base + '_') + for jid in server_job_ids): + return True + # Grace period: the scheduler snapshot may not yet list a just-submitted + # job (array jobs can take seconds to surface in qstat, and a fresh + # get_server_job_ids() call drops any append made by submit). 
+ if pipe.submitted_at is not None \ + and time.time() - pipe.submitted_at < SCHEDULER_VISIBILITY_GRACE: + return True + return False + + def poll_pipes(self, server_job_ids: Optional[List[str]] = None) -> None: """ Reconcile all active pipe runs. @@ -198,12 +233,19 @@ def poll_pipes(self) -> None: Tolerates up to 3 consecutive reconciliation failures per run before marking it as FAILED and removing it. + + Args: + server_job_ids: Job IDs currently present in the cluster queue + (from ``check_running_jobs_ids``). Used to detect when a + pipe's scheduler job has left the queue so that orphaned + tasks can be cleaned up immediately. """ max_consecutive_failures = 3 for run_id in list(self.active_pipes.keys()): pipe = self.active_pipes[run_id] + job_alive = self._is_scheduler_job_alive(pipe, server_job_ids) try: - counts = pipe.reconcile() + counts = pipe.reconcile(scheduler_job_alive=job_alive) except Exception: n_failures = self._pipe_poll_failures.get(run_id, 0) + 1 self._pipe_poll_failures[run_id] = n_failures @@ -370,6 +412,19 @@ def _post_ingest_conf_sp(self, label: str) -> None: else: self.sched.run_composite_job(label) + def _read_task_parser_summary(self, pipe_root: str, task_id: str, attempt_index: int) -> Dict: + """Read parser_summary from a worker-written result.json for a failed task attempt.""" + result_path = os.path.join(get_task_attempt_dir(pipe_root, task_id, attempt_index), 'result.json') + if not os.path.isfile(result_path): + return {} + try: + with open(result_path, 'r') as f: + result_data = json.load(f) + except (OSError, json.JSONDecodeError): + return {} + parser_summary = result_data.get('parser_summary') + return parser_summary if isinstance(parser_summary, dict) else {} + def _eject_to_scheduler(self, pipe: 'PipeRun', spec: TaskSpec, state: 'TaskStateRecord') -> None: """ @@ -402,6 +457,7 @@ def _eject_to_scheduler(self, pipe: 'PipeRun', spec: TaskSpec, return payload = spec.input_payload or {} meta = spec.ingestion_metadata or {} + 
parser_summary = self._read_task_parser_summary(pipe.pipe_root, spec.task_id, state.attempt_index) kwargs = { 'job_type': job_type, 'label': label, @@ -409,6 +465,8 @@ def _eject_to_scheduler(self, pipe: 'PipeRun', spec: TaskSpec, 'job_adapter': spec.engine, 'xyz': payload.get('xyz'), 'conformer': meta.get('conformer_index'), + 'cpu_cores': spec.required_cores, + 'memory': spec.required_memory_mb / 1024.0, } if spec.task_family == 'irc': kwargs['irc_direction'] = meta.get('irc_direction') @@ -416,9 +474,45 @@ def _eject_to_scheduler(self, pipe: 'PipeRun', spec: TaskSpec, kwargs['rotor_index'] = meta.get('rotor_index') kwargs['torsions'] = payload.get('torsions') try: - logger.info(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' - f'ejecting to Scheduler as individual {job_type} job for {label}.') - self.sched.run_job(**kwargs) + if parser_summary: + job = job_factory( + job_adapter=spec.engine, + project=self.sched.project, + project_directory=self.sched.project_directory, + job_type=job_type, + level=Level(repr=spec.level), + ess_settings=self.sched.ess_settings, + species=[self.sched.species_dict[label]], + xyz=payload.get('xyz'), + conformer=meta.get('conformer_index'), + cpu_cores=spec.required_cores, + job_memory_gb=spec.required_memory_mb / 1024.0, + ess_trsh_methods=list(), + execution_type='queue', + irc_direction=meta.get('irc_direction'), + rotor_index=meta.get('rotor_index'), + torsions=payload.get('torsions'), + args=spec.args, + testing=getattr(self.sched, 'testing', False), + ) + parser_summary.setdefault('status', 'errored') + parser_summary.setdefault('keywords', list()) + parser_summary.setdefault('error', '') + parser_summary.setdefault('line', '') + job.job_status = ['done', parser_summary] + logger.info(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' + f'ejecting to Scheduler for immediate troubleshooting as individual ' + f'{job_type} job for {label}.') + self.sched.troubleshoot_ess( + label=label, + job=job, + 
level_of_theory=Level(repr=spec.level), + conformer=meta.get('conformer_index'), + ) + else: + logger.info(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' + f'ejecting to Scheduler as individual {job_type} job for {label}.') + self.sched.run_job(**kwargs) except Exception: logger.error(f'Pipe run {pipe.run_id}, task {spec.task_id}: ' f'failed to eject to Scheduler.', exc_info=True) diff --git a/arc/job/pipe/pipe_coordinator_test.py b/arc/job/pipe/pipe_coordinator_test.py index 087878a5a8..9920fa5652 100644 --- a/arc/job/pipe/pipe_coordinator_test.py +++ b/arc/job/pipe/pipe_coordinator_test.py @@ -5,6 +5,7 @@ This module contains unit tests for the arc.job.pipe.pipe_coordinator module """ +import json import os import shutil import tempfile @@ -66,7 +67,10 @@ def _make_spec(task_id, task_family='conf_opt', engine='mockter', level=None, def _make_mock_sched(project_directory): """Create a mock Scheduler with the attributes PipeCoordinator needs.""" sched = MagicMock() + sched.project = 'pipe_test_project' sched.project_directory = project_directory + sched.ess_settings = {'orca': ['local'], 'mockter': ['local']} + sched.testing = True sched.server_job_ids = list() spc = ARCSpecies(label='H2O', smiles='O') spc.conformers = [None] * 5 @@ -245,6 +249,80 @@ def fake_reconcile(): self.assertIn('77777[]', self.coord.sched.server_job_ids) +class TestIsSchedulerJobAlive(unittest.TestCase): + """Tests for PipeCoordinator._is_scheduler_job_alive().""" + + def test_none_server_ids_returns_true(self): + pipe = MagicMock() + pipe.scheduler_job_id = '12345[]' + self.assertTrue(PipeCoordinator._is_scheduler_job_alive(pipe, None)) + + def test_none_scheduler_job_id_returns_true(self): + pipe = MagicMock() + pipe.scheduler_job_id = None + self.assertTrue(PipeCoordinator._is_scheduler_job_alive(pipe, ['12345[0]'])) + + def test_pbs_array_element_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '4018898[]' + self.assertTrue(PipeCoordinator._is_scheduler_job_alive(pipe, 
['4018898[2]', '9999'])) + + def test_pbs_array_not_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '4018898[]' + self.assertFalse(PipeCoordinator._is_scheduler_job_alive(pipe, ['9999', '5555'])) + + def test_non_array_job_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '12345' + self.assertTrue(PipeCoordinator._is_scheduler_job_alive(pipe, ['12345', '9999'])) + + def test_non_array_job_not_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '12345' + self.assertFalse(PipeCoordinator._is_scheduler_job_alive(pipe, ['9999'])) + + def test_empty_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '12345[]' + self.assertFalse(PipeCoordinator._is_scheduler_job_alive(pipe, [])) + + def test_slurm_array_element_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '12345' + self.assertTrue(PipeCoordinator._is_scheduler_job_alive(pipe, ['12345_7', '9999'])) + + def test_slurm_array_not_in_queue(self): + pipe = MagicMock() + pipe.scheduler_job_id = '12345' + self.assertFalse(PipeCoordinator._is_scheduler_job_alive(pipe, ['99999_7'])) + + +class TestPollPipesJobGone(unittest.TestCase): + """Test that poll_pipes passes server_job_ids to reconcile for orphan detection.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix='pipe_coord_gone_') + self.coord = PipeCoordinator(_make_mock_sched(self.tmpdir)) + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_poll_with_server_job_ids_cleans_stuck_pipe(self): + """A pipe whose scheduler job left the queue gets cleaned up.""" + pipe = self.coord.submit_pipe_run('run_stuck', [_make_spec('t0')]) + pipe.scheduler_job_id = '99999[]' + now = time.time() + update_task_state(pipe.pipe_root, 't0', new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', + claimed_at=now, lease_expires_at=now + 86400) + update_task_state(pipe.pipe_root, 't0', new_status=TaskState.RUNNING, started_at=now) + # Job 99999 is NOT in the queue — pass empty 
list. + self.coord.poll_pipes(server_job_ids=[]) + # The pipe should have been cleaned up (orphaned → failed_terminal → ingested). + self.assertNotIn('run_stuck', self.coord.active_pipes) + + class TestIngestPipeResults(unittest.TestCase): """Tests for PipeCoordinator.ingest_pipe_results().""" @@ -274,6 +352,62 @@ def test_ingest_skips_unreadable_state(self): os.remove(os.path.join(pipe.pipe_root, 'tasks', 't_missing', 'state.json')) self.coord.ingest_pipe_results(pipe) # should not raise + @patch('arc.job.pipe.pipe_coordinator.job_factory') + def test_failed_ess_ejects_to_scheduler_troubleshooting_with_parser_summary(self, mock_job_factory): + """FAILED_ESS tasks with parser_summary should be troubleshooted immediately, not blindly rerun.""" + task = _make_spec('t_ess', task_family='species_sp', engine='orca', cores=12, mem=37888) + pipe = self.coord.submit_pipe_run('run_ess', [task]) + now = time.time() + update_task_state(pipe.pipe_root, 't_ess', new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', + claimed_at=now, lease_expires_at=now + 300) + update_task_state(pipe.pipe_root, 't_ess', new_status=TaskState.RUNNING, started_at=now) + update_task_state(pipe.pipe_root, 't_ess', new_status=TaskState.FAILED_ESS, + ended_at=now + 1, failure_class='ess_error') + result_path = os.path.join(pipe.pipe_root, 'tasks', 't_ess', 'attempts', '0', 'result.json') + os.makedirs(os.path.dirname(result_path), exist_ok=True) + with open(result_path, 'w') as f: + json.dump({'parser_summary': {'status': 'errored', + 'keywords': ['MDCI', 'Memory', 'max_total_job_memory'], + 'error': 'Orca suggests to increase per cpu core memory to 10218 MB.', + 'line': 'Please increase MaxCore'}}, f) + + fake_job = MagicMock() + mock_job_factory.return_value = fake_job + + self.coord.ingest_pipe_results(pipe) + + self.coord.sched.troubleshoot_ess.assert_called_once() + self.coord.sched.run_job.assert_not_called() + kwargs = self.coord.sched.troubleshoot_ess.call_args.kwargs + 
self.assertEqual(kwargs['label'], 'H2O') + self.assertIs(kwargs['job'], fake_job) + self.assertEqual(kwargs['conformer'], 0) + mock_job_factory.assert_called_once() + factory_kwargs = mock_job_factory.call_args.kwargs + self.assertEqual(factory_kwargs['cpu_cores'], 12) + self.assertAlmostEqual(factory_kwargs['job_memory_gb'], 37.0) + + def test_failed_ess_without_parser_summary_preserves_resources_on_rerun(self): + """FAILED_ESS tasks without parser_summary fall back to Scheduler.run_job with original resources.""" + task = _make_spec('t_ess_fallback', task_family='species_sp', engine='orca', cores=10, mem=20480) + pipe = self.coord.submit_pipe_run('run_ess_fallback', [task]) + now = time.time() + update_task_state(pipe.pipe_root, 't_ess_fallback', new_status=TaskState.CLAIMED, + claimed_by='w', claim_token='tok', + claimed_at=now, lease_expires_at=now + 300) + update_task_state(pipe.pipe_root, 't_ess_fallback', new_status=TaskState.RUNNING, started_at=now) + update_task_state(pipe.pipe_root, 't_ess_fallback', new_status=TaskState.FAILED_ESS, + ended_at=now + 1, failure_class='ess_error') + + self.coord.ingest_pipe_results(pipe) + + self.coord.sched.troubleshoot_ess.assert_not_called() + self.coord.sched.run_job.assert_called_once() + kwargs = self.coord.sched.run_job.call_args.kwargs + self.assertEqual(kwargs['cpu_cores'], 10) + self.assertAlmostEqual(kwargs['memory'], 20.0) + class TestComputePipeRoot(unittest.TestCase): """Tests for PipeCoordinator._compute_pipe_root().""" diff --git a/arc/job/pipe/pipe_run.py b/arc/job/pipe/pipe_run.py index 6577b11f35..1974a69a73 100644 --- a/arc/job/pipe/pipe_run.py +++ b/arc/job/pipe/pipe_run.py @@ -36,6 +36,7 @@ logger = get_logger() RESUBMIT_GRACE = 120 # seconds – grace period after resubmission before flagging again +SCHEDULER_VISIBILITY_GRACE = 60 # seconds – trust a just-submitted pipe is alive even if the scheduler snapshot doesn't yet list it pipe_settings = settings['pipe_settings'] default_job_settings = 
settings['default_job_settings'] @@ -279,11 +280,17 @@ def submit_to_scheduler(self): ) return job_status, job_id - def reconcile(self) -> Dict[str, int]: + def reconcile(self, scheduler_job_alive: bool = True) -> Dict[str, int]: """ Poll all tasks, detect orphans, schedule retries, and check for completion. Does not regress an already-terminal run status. + Args: + scheduler_job_alive: Whether the scheduler (PBS/Slurm) job for this + pipe run is still present in the queue. When False, any + CLAIMED/RUNNING tasks are immediately orphaned (the workers + are gone) and unreachable PENDING tasks are failed terminally. + Returns: Dict[str, int]: Counts of tasks in each state. """ @@ -310,9 +317,11 @@ def reconcile(self) -> Dict[str, int]: except (FileNotFoundError, ValueError, KeyError): continue current = TaskState(state.status) + # Orphan detection: lease expired OR the scheduler job is gone. if current in (TaskState.CLAIMED, TaskState.RUNNING) \ - and state.lease_expires_at is not None \ - and now > state.lease_expires_at: + and (not scheduler_job_alive + or (state.lease_expires_at is not None + and now > state.lease_expires_at)): try: update_task_state(self.pipe_root, task_id, new_status=TaskState.ORPHANED, @@ -347,7 +356,9 @@ def reconcile(self) -> Dict[str, int]: try: # FAILED_ESS tasks are handled separately (ejected to Scheduler). # Only FAILED_RETRYABLE and ORPHANED reach here. - if state.attempt_index + 1 < state.max_attempts: + # Only retry if the scheduler job is alive (workers exist + # to claim the retried task); otherwise fail terminally. + if state.attempt_index + 1 < state.max_attempts and scheduler_job_alive: update_task_state(self.pipe_root, task_id, new_status=TaskState.PENDING, attempt_index=state.attempt_index + 1, @@ -389,6 +400,30 @@ def reconcile(self) -> Dict[str, int]: f'scheduler workers still starting, skipping resubmission.') self._needs_resubmission = False + # If the scheduler job is gone, any PENDING tasks will never be + # claimed. 
Mark them as terminally failed so the pipe can finish. + if not scheduler_job_alive and counts[TaskState.PENDING.value] > 0: + for task_id in task_ids: + if not os.path.isdir(os.path.join(tasks_dir, task_id)): + continue + try: + state = read_task_state(self.pipe_root, task_id) + except (FileNotFoundError, ValueError, KeyError): + continue + if TaskState(state.status) != TaskState.PENDING: + continue + try: + update_task_state(self.pipe_root, task_id, + new_status=TaskState.FAILED_TERMINAL, + ended_at=now) + counts[TaskState.PENDING.value] -= 1 + counts[TaskState.FAILED_TERMINAL.value] += 1 + logger.info(f'Task {task_id}: no workers remain ' + f'(scheduler job gone). Marked FAILED_TERMINAL.') + except (ValueError, TimeoutError) as e: + logger.warning(f'Task {task_id}: failed to mark stranded pending task ' + f'during reconciliation: {e}') + terminal = (counts[TaskState.COMPLETED.value] + counts[TaskState.FAILED_ESS.value] + counts[TaskState.FAILED_TERMINAL.value] diff --git a/arc/job/pipe/pipe_state.py b/arc/job/pipe/pipe_state.py index 0de4a78808..ee6f4d372c 100644 --- a/arc/job/pipe/pipe_state.py +++ b/arc/job/pipe/pipe_state.py @@ -110,7 +110,7 @@ class PipeRunState(str, Enum): # Allowed transitions: maps each state to the set of states it may transition to. 
TASK_TRANSITIONS: Dict[TaskState, Tuple[TaskState, ...]] = { - TaskState.PENDING: (TaskState.CLAIMED, TaskState.CANCELLED), + TaskState.PENDING: (TaskState.CLAIMED, TaskState.CANCELLED, TaskState.FAILED_TERMINAL), TaskState.CLAIMED: (TaskState.RUNNING, TaskState.ORPHANED, TaskState.CANCELLED), TaskState.RUNNING: (TaskState.COMPLETED, TaskState.FAILED_RETRYABLE, TaskState.FAILED_ESS, TaskState.FAILED_TERMINAL, TaskState.ORPHANED, TaskState.CANCELLED), diff --git a/arc/job/trsh.py b/arc/job/trsh.py index f1878a7011..2c27006785 100644 --- a/arc/job/trsh.py +++ b/arc/job/trsh.py @@ -388,6 +388,12 @@ def determine_ess_status(output_path: str, keywords.append('GTOInt') keywords.append('Memory') break + elif 'F12-CC method not implemented for UHF references' in line: + # ORCA 5.x rejects non-RI F12-CC on open-shell species; the /RI variant works. + keywords = ['F12_UHF'] + error = ('ORCA does not support non-RI F12-CC methods on UHF references. ' + 'Retrying with /RI appended to the method.') + break if done: return 'done', keywords, '', '' error = error if error else 'Orca job terminated for an unknown reason.' @@ -1018,48 +1024,41 @@ def trsh_ess_job(label: str, couldnt_trsh = True elif 'orca' in software: - if 'dlpno' in level_of_theory.method and (is_monoatomic or is_h): - raise TrshError(f'DLPNO methods are incompatible with monoatomic species {label} in Orca. ' + if 'dlpno' in level_of_theory.method and is_h: + raise TrshError(f'DLPNO methods are incompatible with single-electron species {label} in Orca. ' f'This should have been caught by the Scheduler before job submission.') elif 'Memory' in job_status['keywords']: - # Increase memory allocation. - # job_status will be for example - # `Error (ORCA_SCF): Not enough memory available! Please increase MaxCore to more than: 289 MB`. + # ORCA memory troubleshooting keeps the total job memory fixed and + # reduces cpu cores so %%maxcore increases on the rerun. 
if 'memory' not in ess_trsh_methods: ess_trsh_methods.append('memory') + original_cpu_cores = cpu_cores + total_memory_mb = math.ceil(memory_gb * 1024) try: # parse Orca's memory requirement in MB estimated_mem_per_core = float(job_status['error'].split()[-2]) + # round up to the next hundred + estimated_mem_per_core = int(np.ceil(estimated_mem_per_core / 100.0)) * 100 + cpu_cores = math.floor(total_memory_mb / estimated_mem_per_core) except ValueError: - estimated_mem_per_core = estimate_orca_mem_cpu_requirement(num_heavy_atoms=num_heavy_atoms, - server=server, - consider_server_limits=True)[1]/cpu_cores - # round up to the next hundred - estimated_mem_per_core = int(np.ceil(estimated_mem_per_core / 100.0)) * 100 - if 'max_total_job_memory' in job_status['keywords']: - per_cpu_core_memory = np.ceil(memory_gb / cpu_cores * 1024) - logger.info(f'The crashed Orca job {label} was ran with {cpu_cores} cpu cores and ' - f'{per_cpu_core_memory} MB memory per cpu core. It requires at least ' - f'{estimated_mem_per_core} MB per cpu core. Since the job had already requested the ' - f'maximum amount of available total node memory, ARC will attempt to reduce the number ' - f'of cpu cores to increase memory per cpu core.') - if 'cpu' not in ess_trsh_methods: - ess_trsh_methods.append('cpu') - cpu_cores = math.floor(cpu_cores * per_cpu_core_memory / estimated_mem_per_core) - 2 # be conservative - if cpu_cores > 1: - logger.info(f'Troubleshooting job {label} using {cpu_cores} cpu cores.') - elif cpu_cores == 1: # last resort - logger.info(f'Troubleshooting job {label} using only {cpu_cores} cpu core. Notice that the ' - f'required job time may be unrealistically long or exceed limits on servers.') - else: + # Old ORCA messages do not provide a numeric target. Step cores down + # so %%maxcore increases on each retry instead of resubmitting the same input. 
+ cpu_cores = original_cpu_cores - 2 if original_cpu_cores > 2 else original_cpu_cores - 1 + + if not couldnt_trsh: + if cpu_cores >= original_cpu_cores: + cpu_cores = original_cpu_cores - 2 if original_cpu_cores > 2 else original_cpu_cores - 1 + if cpu_cores < 1: logger.info(f'Not enough computational resource to accomplish job {label}. Please consider cheaper ' f'methods or allocate more resources if possible.') couldnt_trsh = True - if not couldnt_trsh: - memory = estimated_mem_per_core * cpu_cores # total memory for all cpu cores - memory = np.ceil(memory / 1024 + 5) # convert MB to GB, add 5 extra GB (be conservative) - logger.info(f'Troubleshooting {job_type} job in {software} for {label} using {memory} GB total memory ' - f'and {cpu_cores} cpu cores.') + memory = memory_gb + if not couldnt_trsh: + if cpu_cores != original_cpu_cores and 'cpu' not in ess_trsh_methods: + ess_trsh_methods.append('cpu') + per_cpu_core_memory = np.ceil(memory * 1024 / cpu_cores) + logger.info(f'Troubleshooting {job_type} job in {software} for {label} using {memory} GB total memory ' + f'and {cpu_cores} cpu cores ({per_cpu_core_memory:.0f} MB per cpu core).') elif 'cpu' in job_status['keywords']: # Reduce cpu allocation. try: @@ -1072,6 +1071,16 @@ def trsh_ess_job(label: str, logger.info(f'Troubleshooting {job_type} job in {software} for {label} using {cpu_cores} cpu cores.') if 'cpu' not in ess_trsh_methods: ess_trsh_methods.append('cpu') + elif 'F12_UHF' in job_status['keywords'] and 'f12_ri' not in ess_trsh_methods: + # Non-RI F12-CC on UHF: append '/RI' to the method and retry this job only. 
+ level_dict = level_of_theory.as_dict() + level_dict.pop('method_type', None) # re-deduce after method change + level_dict['method'] = f"{level_of_theory.method}/ri" + level_of_theory = Level(repr=level_dict) + ess_trsh_methods.append('f12_ri') + logger.info(f"Troubleshooting {job_type} job in {software} for {label}: appending '/RI' to the " + f"method (non-RI F12-CC is unsupported for UHF references in this ORCA version). " + f"Retrying with {level_of_theory.method}.") else: couldnt_trsh = True diff --git a/arc/job/trsh_test.py b/arc/job/trsh_test.py index d974874e9c..20e6a3e944 100644 --- a/arc/job/trsh_test.py +++ b/arc/job/trsh_test.py @@ -677,7 +677,7 @@ def test_trsh_ess_job(self): # Test Orca # Orca: test 1 # Test troubleshooting insufficient memory issue - # Automatically increase memory provided not exceeding maximum available memory + # Keep total memory fixed and reduce cpu cores so %%maxcore increases label = 'test' level_of_theory = {'method': 'dlpno-ccsd(T)'} server = 'server1' @@ -697,8 +697,8 @@ def test_trsh_ess_job(self): job_type, software, fine, memory_gb, num_heavy_atoms, cpu_cores, ess_trsh_methods) self.assertIn('memory', ess_trsh_methods) - self.assertEqual(cpu_cores, 32) - self.assertAlmostEqual(memory, 327) + self.assertEqual(cpu_cores, 24) + self.assertAlmostEqual(memory, 250) # Orca: test 2 # Test troubleshooting insufficient memory issue @@ -723,8 +723,8 @@ def test_trsh_ess_job(self): job_type, software, fine, memory_gb, num_heavy_atoms, cpu_cores, ess_trsh_methods) self.assertIn('memory', ess_trsh_methods) - self.assertEqual(cpu_cores, 22) - self.assertAlmostEqual(memory, 227) + self.assertEqual(cpu_cores, 24) + self.assertAlmostEqual(memory, 250) # Orca: test 3 # Test troubleshooting insufficient memory issue @@ -753,6 +753,28 @@ def test_trsh_ess_job(self): self.assertLess(cpu_cores, 1) # can't really run job with less than 1 cpu ^o^ # Orca: test 4 + # Test troubleshooting a capped-memory boundary where 1 cpu core is still viable + 
label = 'test' + level_of_theory = {'method': 'dlpno-ccsd(T)'} + server = 'server1' + job_type = 'sp' + software = 'orca' + fine = True + memory_gb = 10 + cpu_cores = 2 + num_heavy_atoms = 2 + ess_trsh_methods = ['memory'] + job_status = {'keywords': ['MDCI', 'Memory', 'max_total_job_memory'], + 'error': 'Orca suggests to increase per cpu core memory to 10000 MB.'} + output_errors, ess_trsh_methods, remove_checkfile, level_of_theory, software, job_type, fine, trsh_keyword, \ + memory, shift, cpu_cores, couldnt_trsh = trsh.trsh_ess_job(label, level_of_theory, server, job_status, + job_type, software, fine, memory_gb, + num_heavy_atoms, cpu_cores, ess_trsh_methods) + self.assertEqual(couldnt_trsh, False) + self.assertEqual(cpu_cores, 1) + self.assertEqual(memory, 10) + + # Orca: test 5 # Test troubleshooting too many cpu cores # Automatically reduce cpu cores label = 'test' @@ -776,7 +798,95 @@ def test_trsh_ess_job(self): self.assertIn('cpu', ess_trsh_methods) self.assertEqual(cpu_cores, 10) - # Orca: test 5 + # Orca: test 6 + # Test troubleshooting old Orca memory message without a numeric MaxCore suggestion + label = 'test' + level_of_theory = {'method': 'dlpno-ccsd(T)'} + server = 'server1' + job_type = 'sp' + software = 'orca' + fine = True + memory_gb = 16 + cpu_cores = 8 + num_heavy_atoms = 2 + ess_trsh_methods = ['memory'] + job_status = {'keywords': ['MDCI', 'Memory'], + 'error': 'Insufficient job memory.'} + output_errors, ess_trsh_methods, remove_checkfile, level_of_theory, software, job_type, fine, trsh_keyword, \ + memory, shift, cpu_cores, couldnt_trsh = trsh.trsh_ess_job(label, level_of_theory, server, job_status, + job_type, software, fine, memory_gb, + num_heavy_atoms, cpu_cores, ess_trsh_methods) + self.assertEqual(couldnt_trsh, False) + self.assertEqual(cpu_cores, 6) + self.assertEqual(memory, 16) + + # Orca: test 7 + # Test troubleshooting generic Orca memory errors without resubmitting identical resources + label = 'test' + level_of_theory = 
{'method': 'dlpno-ccsd(T)'} + server = 'server1' + job_type = 'sp' + software = 'orca' + fine = True + memory_gb = 25 + cpu_cores = 8 + num_heavy_atoms = 2 + ess_trsh_methods = ['memory'] + job_status = {'keywords': ['MDCI', 'Memory'], + 'error': 'Insufficient job memory.'} + output_errors, ess_trsh_methods, remove_checkfile, level_of_theory, software, job_type, fine, trsh_keyword, \ + memory, shift, cpu_cores, couldnt_trsh = trsh.trsh_ess_job(label, level_of_theory, server, job_status, + job_type, software, fine, memory_gb, + num_heavy_atoms, cpu_cores, ess_trsh_methods) + self.assertEqual(couldnt_trsh, False) + self.assertEqual(cpu_cores, 6) + self.assertEqual(memory, 25) + + # Orca: test 8 + # Test stepping from 2 cpu cores down to 1 for generic Orca memory errors + label = 'test' + level_of_theory = {'method': 'dlpno-ccsd(T)'} + server = 'server1' + job_type = 'sp' + software = 'orca' + fine = True + memory_gb = 25 + cpu_cores = 2 + num_heavy_atoms = 2 + ess_trsh_methods = ['memory'] + job_status = {'keywords': ['MDCI', 'Memory'], + 'error': 'Insufficient job memory.'} + output_errors, ess_trsh_methods, remove_checkfile, level_of_theory, software, job_type, fine, trsh_keyword, \ + memory, shift, cpu_cores, couldnt_trsh = trsh.trsh_ess_job(label, level_of_theory, server, job_status, + job_type, software, fine, memory_gb, + num_heavy_atoms, cpu_cores, ess_trsh_methods) + self.assertEqual(couldnt_trsh, False) + self.assertEqual(cpu_cores, 1) + self.assertEqual(memory, 25) + + # Orca: test 9 + # Test troubleshooting old Orca cpu-limit message without a numeric pair count + label = 'test' + level_of_theory = {'method': 'dlpno-ccsd(T)'} + server = 'server1' + job_type = 'sp' + software = 'orca' + fine = True + memory_gb = 16 + cpu_cores = 16 + num_heavy_atoms = 2 + ess_trsh_methods = ['cpu'] + job_status = {'keywords': ['MDCI', 'cpu'], + 'error': 'Orca cannot utilize cpu cores more than electron pairs in a molecule. 
ARC will ' + 'estimate the number of cpu cores needed based on the number of heavy atoms in the ' + 'molecule.'} + output_errors, ess_trsh_methods, remove_checkfile, level_of_theory, software, job_type, fine, trsh_keyword, \ + memory, shift, cpu_cores, couldnt_trsh = trsh.trsh_ess_job(label, level_of_theory, server, job_status, + job_type, software, fine, memory_gb, + num_heavy_atoms, cpu_cores, ess_trsh_methods) + self.assertEqual(cpu_cores, 10) + + # Orca: test 10 # Test that DLPNO + monoatomic species raises TrshError label = 'H' level_of_theory = {'method': 'dlpno-ccsd(T)'} diff --git a/arc/mapping/driver.py b/arc/mapping/driver.py index 64fa5fb899..0ebb9de3ee 100644 --- a/arc/mapping/driver.py +++ b/arc/mapping/driver.py @@ -304,6 +304,12 @@ def map_rxn(rxn: 'ARCReaction', return None fragment_maps = map_pairs(pairs) + if any(m is None for m in fragment_maps): + logger.error(f'Could not map all scissored pairs for reaction {rxn} ({rxn.family}); ' + f'{sum(1 for m in fragment_maps if m is None)}/{len(fragment_maps)} pair(s) failed.') + if rxn.product_dicts is not None and len(rxn.product_dicts) - 1 > pdi < MAX_PDI: + return map_rxn(rxn, backend=backend, product_dict_index_to_try=pdi + 1) + return None total_atoms = sum(len(sp.mol.atoms) for sp in reactants) atom_map = glue_maps(maps=fragment_maps, pairs=pairs, diff --git a/arc/mapping/engine.py b/arc/mapping/engine.py index be4dd7f815..f5a0354803 100644 --- a/arc/mapping/engine.py +++ b/arc/mapping/engine.py @@ -1165,21 +1165,31 @@ def update_xyz(species: List[ARCSpecies]) -> List[ARCSpecies]: return new -def r_cut_p_cut_isomorphic(reactant: ARCSpecies, product_: ARCSpecies) -> bool: +def r_cut_p_cut_isomorphic(reactant: ARCSpecies, product_: ARCSpecies, strict: bool = False) -> bool: """ A function for checking if the reactant and product are the same molecule. Args: reactant (ARCSpecies): An ARCSpecies. might be as a result of scissors() product_ (ARCSpecies): an ARCSpecies. 
might be as a result of scissors() + strict (bool, optional): When ``True``, require full graph isomorphism. When ``False`` (default), + accept either fingerprint (formula) match or graph isomorphism — the looser criterion lets + downstream ``map_two_species`` recover atom correspondence in rearrangements whose cut + fragments are not strictly isomorphic. Strict mode is used as a first pass in + ``pairing_reactants_and_products_for_mapping`` to avoid pairing constitutional isomers + that share a molecular formula but differ in graph structure. Returns: bool: ``True`` if they are isomorphic, ``False`` otherwise. """ res1 = generate_resonance_structures_safely(reactant.mol, save_order=True) for res in res1: - if res.fingerprint == product_.mol.fingerprint or product_.mol.is_isomorphic(res, save_order=True): - return True + if strict: + if product_.mol.is_isomorphic(res, save_order=True): + return True + else: + if res.fingerprint == product_.mol.fingerprint or product_.mol.is_isomorphic(res, save_order=True): + return True return False @@ -1189,6 +1199,12 @@ def pairing_reactants_and_products_for_mapping(r_cuts: List[ARCSpecies], """ A function for matching reactants and products in scissored products. + Greedy two-pass pairing: + 1) Strict graph isomorphism — avoids pairing constitutional isomers that merely share a + molecular formula (e.g. α- vs β-radical positional isomers in H-abstraction). + 2) Loose fingerprint-or-isomorphic fallback — for rearrangements whose cut fragments are + not strictly isomorphic but can still be aligned by ``map_two_species``. + Args: r_cuts (List[ARCSpecies]): A list of the scissored species in the reactants p_cuts (List[ARCSpecies]): A list of the scissored species in the reactants @@ -1197,7 +1213,16 @@ def pairing_reactants_and_products_for_mapping(r_cuts: List[ARCSpecies], List[Tuple[ARCSpecies,ARCSpecies]]: A list of paired reactant and products, to be sent to map_two_species. 
""" pairs: List[Tuple[ARCSpecies, ARCSpecies]] = list() + unmatched_r: List[ARCSpecies] = list() for react in r_cuts: + for idx, prod in enumerate(p_cuts): + if r_cut_p_cut_isomorphic(react, prod, strict=True): + pairs.append((react, prod)) + p_cuts.pop(idx) + break + else: + unmatched_r.append(react) + for react in unmatched_r: for idx, prod in enumerate(p_cuts): if r_cut_p_cut_isomorphic(react, prod): pairs.append((react, prod)) @@ -1236,28 +1261,34 @@ def label_species_atoms(species: List['ARCSpecies']) -> None: index += 1 -def glue_maps(maps: List[List[int]], +def glue_maps(maps: List[Optional[List[int]]], pairs: List[Tuple[ARCSpecies, ARCSpecies]], r_label_map: Dict[str, int], p_label_map: Dict[str, int], total_atoms: int, - ) -> List[int]: + ) -> Optional[List[int]]: """ Join the maps from the parts of a bimolecular reaction. Args: - maps (List[List[int]]): The list of all maps of the isomorphic cuts. + maps (List[Optional[List[int]]]): The per-pair maps of the isomorphic cuts. An entry may + be ``None`` when ``map_two_species`` could not produce a map for that pair; in that + case ``glue_maps`` aborts and returns ``None`` so the caller can fall back. pairs (List[Tuple[ARCSpecies, ARCSpecies]]): The pairs of the reactants and products. r_label_map (Dict[str, int]): A dictionary mapping the reactant labels to their indices. p_label_map (Dict[str, int]): A dictionary mapping the product labels to their indices. total_atoms (int): The total number of atoms across all reactants. Returns: - List[int]: An Atom Map of the complete reaction. + Optional[List[int]]: The complete atom map, or ``None`` if any per-pair map is ``None``. 
""" # 1) Build base map am_dict: Dict[int,int] = {} for map_list, (r_cut, p_cut) in zip(maps, pairs): + if map_list is None: + logger.warning(f'glue_maps: received a None per-pair map for ' + f'{r_cut.mol.smiles} -> {p_cut.mol.smiles}; cannot build atom map.') + return None for local_i, r_atom in enumerate(r_cut.mol.atoms): r_glob = int(r_atom.label) p_glob = int(p_cut.mol.atoms[map_list[local_i]].label) diff --git a/arc/mapping/engine_test.py b/arc/mapping/engine_test.py index a4810b9636..93d42992ef 100644 --- a/arc/mapping/engine_test.py +++ b/arc/mapping/engine_test.py @@ -1576,6 +1576,33 @@ def test_r_cut_p_cut_isomorphic(self): self.assertTrue(engine.r_cut_p_cut_isomorphic(ARCSpecies(label="r1", smiles="F[C]F", multiplicity=1), ARCSpecies(label="r1", smiles="F[C]F", multiplicity=3))) + def test_r_cut_p_cut_isomorphic_strict(self): + """Strict mode rejects constitutional isomers that share a molecular formula.""" + alpha = ARCSpecies(label='alpha', smiles='C[CH]OCCC') # α-radical + beta = ARCSpecies(label='beta', smiles='CC[CH]OCC') # β-radical + self.assertTrue(engine.r_cut_p_cut_isomorphic(alpha, beta, strict=False)) + self.assertFalse(engine.r_cut_p_cut_isomorphic(alpha, beta, strict=True)) + same_a = ARCSpecies(label='a', smiles='CC[CH]OCC') + same_b = ARCSpecies(label='b', smiles='CC[CH]OCC') + self.assertTrue(engine.r_cut_p_cut_isomorphic(same_a, same_b, strict=True)) + + def test_pairing_prefers_strict_match_for_formula_isomers(self): + """ + H-abstraction where abstractor = same species on both sides and the two + radicals on the radical side are α/β positional isomers of the same + skeleton. Strict-first pairing must match intact radicals with their + isomorphic cut-fragment counterparts, not with the wrong intact radical. 
+ """ + r_1 = ARCSpecies(label='r1', smiles='CC[CH]OCC') + r_2 = ARCSpecies(label='r2', smiles='CCCOCC') + p_1 = ARCSpecies(label='p1', smiles='C[CH]OCCC') + p_2 = ARCSpecies(label='p2', smiles='CCCOCC') + rxn = ARCReaction(r_species=[r_1, r_2], p_species=[p_1, p_2]) + self.assertEqual(rxn.family, 'H_Abstraction') + atom_map = rxn.atom_map + self.assertIsNotNone(atom_map) + self.assertEqual(len(atom_map), sum(s.number_of_atoms for s in rxn.r_species)) + def test_pairing_reactants_and_products_for_mapping(self): """Test the pairing_reactants_and_products_for_mapping() function""" smiles = ['CC(=O)O[CH]CN', '[H]', '[CH3]'] diff --git a/arc/molecule/converter.py b/arc/molecule/converter.py index 5276ced0a8..c0cef71270 100644 --- a/arc/molecule/converter.py +++ b/arc/molecule/converter.py @@ -57,7 +57,8 @@ def to_rdkit_mol(mol, remove_h=True, return_mapping=False, sanitize=True, save_o # Add the bonds for atom1 in mol.vertices: for atom2, bond in atom1.edges.items(): - if bond.is_hydrogen_bond(): + # Skip non-covalent bonds RDKit cannot represent (H-bonds, vdW, reaction bonds). + if bond.is_hydrogen_bond() or bond.is_van_der_waals() or bond.is_reaction_bond(): continue index1 = atoms.index(atom1) index2 = atoms.index(atom2) diff --git a/arc/reaction/reaction.py b/arc/reaction/reaction.py index 58875496f3..9ce02ebfa1 100644 --- a/arc/reaction/reaction.py +++ b/arc/reaction/reaction.py @@ -941,8 +941,6 @@ def get_bonds(self, A length-2 tuple is which entries represent reactants and product information, respectively. Each entry is a list of tuples, each represents a bond and contains sorted atom indices. 
""" - if self.atom_map is None: - raise ReactionError('Cannot get bonds without an atom map.') reactants, products = self.get_reactants_and_products() r_bonds, p_bonds = list(), list() len_atoms = 0 @@ -956,6 +954,8 @@ def get_bonds(self, len_atoms = 0 if r_bonds_only: return r_bonds, p_bonds + if self.atom_map is None: + raise ReactionError('Cannot get product bonds without an atom map.') for spc in products: for i, atom_1 in enumerate(spc.mol.atoms): for atom2, bond12 in atom_1.edges.items(): @@ -964,17 +964,82 @@ def get_bonds(self, if bond not in p_bonds: p_bonds.append(bond) len_atoms += spc.number_of_atoms - mapped_p_bonds = list() - for p_bond in p_bonds: - mapped_p_bonds.append(tuple([self.atom_map.index(p_bond[0]), self.atom_map.index(p_bond[1])])) return r_bonds, p_bonds + def _get_reactive_bonds_from_family( + self, + ) -> Optional[Tuple[List[Tuple[int, int]], List[Tuple[int, int]], List[Tuple[int, int]]]]: + """ + Derive formed, broken, and order-changed bonds directly from the RMG family's + recipe actions and ``r_label_map``, bypassing ``atom_map``. + + All tuples are in reactant global index space. This is the canonical source + used elsewhere in ARC (see ``arc.mapping.engine.find_all_breaking_bonds``), + and is sufficient for NMD validation which only needs reactive-atom roles. + + Returns: + Optional[Tuple[formed, broken, changed]]: ``None`` when the family or + ``r_label_map`` is unavailable and the label-based path cannot be taken. 
+ """ + if self.family is None or not self.product_dicts: + return None + r_label_map = self.product_dicts[0].get('r_label_map') + if not r_label_map: + return None + actions = ReactionFamily(label=self.family).actions + formed, broken, changed = list(), list(), list() + for action in actions: + kind = action[0].upper() + if kind not in ('BREAK_BOND', 'FORM_BOND', 'CHANGE_BOND'): + continue + label_1, label_2 = action[1], action[3] + idx_1 = self._resolve_label_index(r_label_map, label_1, preferred_occurrence=0) + idx_2 = self._resolve_label_index( + r_label_map, label_2, + preferred_occurrence=1 if label_1 == label_2 else 0, + ) + if idx_1 is None or idx_2 is None or idx_1 == idx_2: + return None + bond = tuple(sorted((idx_1, idx_2))) + if kind == 'FORM_BOND': + formed.append(bond) + elif kind == 'BREAK_BOND': + broken.append(bond) + else: + changed.append(bond) + return formed, broken, changed + + @staticmethod + def _resolve_label_index( + label_map: Dict[str, int], + label: str, + preferred_occurrence: int = 0, + ) -> Optional[int]: + """Resolve an RMG label (e.g. ``'*1'``) to a global atom index in ``label_map``. + + When the same label appears twice in a recipe (e.g. ``R_Recombination``), + the label map disambiguates duplicates via suffixed keys (``'*'``, ``'*_2'``). + """ + keys = sorted(k for k in label_map if k == label or k.startswith(f'{label}_')) + if not keys: + return None + return label_map[keys[min(preferred_occurrence, len(keys) - 1)]] + def get_formed_and_broken_bonds(self) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]: """ Get all bonds that were formed or broken in the reaction. Returns: Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]: The formed and broken bonds. 
""" + if self.atom_map is None and self.family and self.product_dicts: + family_bonds = self._get_reactive_bonds_from_family() + if family_bonds is not None: + formed, broken, _changed = family_bonds + logger.warning( + f'Using RMG-family-derived formed/broken bonds for reaction {self} ' + f'because no atom map is available.' + ) + return formed, broken r_bonds, p_bonds = self.get_bonds() r_bonds, p_bonds = set(r_bonds), set(p_bonds) formed_bonds, broken_bonds = p_bonds - r_bonds, r_bonds - p_bonds @@ -986,6 +1051,11 @@ def get_changed_bonds(self) -> List[Tuple[int, int]]: Returns: List[Tuple[int, int]]: The bonds that change their bond order. """ + if self.atom_map is None and self.family and self.product_dicts: + family_bonds = self._get_reactive_bonds_from_family() + if family_bonds is not None: + _formed, _broken, changed = family_bonds + return changed r_bonds, p_bonds = self.get_bonds() r_bonds, p_bonds = set(r_bonds), set(p_bonds) shared_bonds = p_bonds.intersection(r_bonds) diff --git a/arc/reaction/reaction_test.py b/arc/reaction/reaction_test.py index 0374fb67eb..9fb82eba38 100644 --- a/arc/reaction/reaction_test.py +++ b/arc/reaction/reaction_test.py @@ -912,6 +912,36 @@ def test_get_formed_and_broken_bonds(self): self.assertEqual(formed_bonds, [(0, 5)]) self.assertEqual(broken_bonds, [(3, 5)]) + def test_get_bonds_and_reactive_bonds_without_atom_map(self): + """ + When atom_map cannot be built (e.g. fingerprint-based pair superposition + fails on positional-isomer radicals), get_bonds(r_bonds_only=True) must + still work, and get_formed_and_broken_bonds / get_changed_bonds must fall + back to the RMG family recipe + r_label_map instead of raising. + """ + # Intermolecular H-abstraction between α- and β-radical of pentyl ether. + # map_two_species cannot superimpose the two radical cuts, so atom_map is None. 
+ r1 = ARCSpecies(label='r1', smiles='CC[CH]OCC') + r2 = ARCSpecies(label='r2', smiles='CCCOCC') + p1 = ARCSpecies(label='p1', smiles='C[CH]OCCC') + p2 = ARCSpecies(label='p2', smiles='CCCOCC') + rxn = ARCReaction(r_species=[r1, r2], p_species=[p1, p2]) + self.assertEqual(rxn.family, 'H_Abstraction') + self.assertTrue(rxn.product_dicts) + rxn.atom_map = None # simulate mapping failure + + r_bonds, p_bonds = rxn.get_bonds(r_bonds_only=True) + self.assertGreater(len(r_bonds), 0) + self.assertEqual(p_bonds, []) + + r_label_map = rxn.product_dicts[0]['r_label_map'] + star1, star2, star3 = r_label_map['*1'], r_label_map['*2'], r_label_map['*3'] + + formed, broken = rxn.get_formed_and_broken_bonds() + self.assertEqual(formed, [tuple(sorted((star2, star3)))]) + self.assertEqual(broken, [tuple(sorted((star1, star2)))]) + self.assertEqual(rxn.get_changed_bonds(), []) + def test_get_changed_bonds(self): """Test the get_changed_bonds() function.""" rxn_7 = ARCReaction(r_species=[ARCSpecies(label='C2H5NO2', smiles='[O-][N+](=O)CC', diff --git a/arc/scheduler.py b/arc/scheduler.py index c56fae7d72..8f343797f6 100644 --- a/arc/scheduler.py +++ b/arc/scheduler.py @@ -828,7 +828,7 @@ def schedule_jobs(self): # Poll active pipe runs (per-run failures are handled inside poll_pipes). if self.active_pipes: - self.pipe_coordinator.poll_pipes() + self.pipe_coordinator.poll_pipes(server_job_ids=self.server_job_ids) # Flush deferred pipe batches (SP, freq, IRC, conf_sp) after all # newly-ready work has been discovered and before the loop sleeps. 
@@ -1069,7 +1069,8 @@ def end_job(self, job: 'JobAdapter', if job_name in self.running_jobs[label]: self.running_jobs[label].pop(self.running_jobs[label].index(job_name)) - if job.job_status[1]['status'] == 'errored' and job.job_status[1]['keywords'] == ['memory']: + if job.job_status[1]['status'] == 'errored' and len(job.job_status[1]['keywords']) == 1 \ + and job.job_status[1]['keywords'][0].lower() == 'memory': original_mem = job.job_memory_gb if 'insufficient job memory' in job.job_status[1]['error'].lower(): job.job_memory_gb *= 3 @@ -1444,16 +1445,15 @@ def run_sp_job(self, level_of_theory='ccsd/cc-pvdz', job_type='sp') return - if self.species_dict[label].is_monoatomic() and 'dlpno' in level.method: - species = self.species_dict[label] - if species.mol.atoms[0].element.symbol in ('H', 'D', 'T'): - logger.info(f'Using HF/{level.basis} for {label} (single electron, no correlation).') - level = Level(method='hf', basis=level.basis, software=level.software, args=level.args) - else: - canonical_method = level.method.replace('dlpno-', '') - logger.info(f'DLPNO methods are incompatible with monoatomic species {label}. ' - f'Using {canonical_method}/{level.basis} instead.') - level = Level(method=canonical_method, basis=level.basis, software=level.software, args=level.args) + if self.species_dict[label].is_monoatomic() and 'dlpno' in level.method \ + and self.species_dict[label].mol.atoms[0].element.symbol in ('H', 'D', 'T'): + # DLPNO needs electron pairs; fall back to HF for single-electron atoms only. + # Heavier monoatomics (e.g. [O], [N]) run DLPNO fine in ORCA and are left alone. 
+ logger.info(f'Using HF/{level.basis} for {label} (single electron, no correlation).') + level_dict = level.as_dict() + level_dict.pop('method_type', None) # re-deduce after method change + level_dict['method'] = 'hf' + level = Level(repr=level_dict) if self.job_types['sp']: if self.species_dict[label].multi_species: if self.output_multi_spc[self.species_dict[label].multi_species].get('sp', False): @@ -3100,9 +3100,12 @@ def check_directed_scan(self, label, pivots, scan, energies): # a lower conformation was found deg_increment = actions[1] self.species_dict[label].set_dihedral(scan=scan, index=1, deg_increment=deg_increment) - is_isomorphic = self.species_dict[label].check_xyz_isomorphism( - allow_nonisomorphic_2d=self.allow_nonisomorphic_2d, - xyz=self.species_dict[label].initial_xyz) + if self.species_dict[label].is_ts: + is_isomorphic = True + else: + is_isomorphic = self.species_dict[label].check_xyz_isomorphism( + allow_nonisomorphic_2d=self.allow_nonisomorphic_2d, + xyz=self.species_dict[label].initial_xyz) if is_isomorphic: self.delete_all_species_jobs(label) # Remove all completed rotor calculation information @@ -3162,7 +3165,10 @@ def check_directed_scan_job(self, label: str, job: 'JobAdapter'): """ if job.job_status[1]['status'] == 'done': xyz = parser.parse_geometry(log_file_path=job.local_path_to_output_file) - is_isomorphic = self.species_dict[label].check_xyz_isomorphism(xyz=xyz, verbose=False) + if self.species_dict[label].is_ts: + is_isomorphic = True + else: + is_isomorphic = self.species_dict[label].check_xyz_isomorphism(xyz=xyz, verbose=False) for rotor_dict in self.species_dict[label].rotors_dict.values(): if rotor_dict['pivots'] == job.pivots: key = tuple(f'{dihedral:.2f}' for dihedral in job.dihedrals) @@ -3395,9 +3401,12 @@ def troubleshoot_scan_job(self, break else: # If 'change conformer' is not used, check for isomorphism. 
- is_isomorphic = self.species_dict[label].check_xyz_isomorphism( - allow_nonisomorphic_2d=self.allow_nonisomorphic_2d, - xyz=new_xyz) + if self.species_dict[label].is_ts: + is_isomorphic = True + else: + is_isomorphic = self.species_dict[label].check_xyz_isomorphism( + allow_nonisomorphic_2d=self.allow_nonisomorphic_2d, + xyz=new_xyz) if is_isomorphic: self.species_dict[label].final_xyz = new_xyz # Remove all completed rotor calculation information. diff --git a/arc/scheduler_test.py b/arc/scheduler_test.py index cdc4b17f07..dfc3bd77b7 100644 --- a/arc/scheduler_test.py +++ b/arc/scheduler_test.py @@ -5,8 +5,9 @@ This module contains unit tests for the arc.scheduler module """ +import math import unittest -from unittest.mock import patch +from unittest.mock import MagicMock, patch import os import shutil @@ -18,6 +19,7 @@ from arc.plotter import save_conformers_file from arc.scheduler import Scheduler, species_has_freq, species_has_geo, species_has_sp, species_has_sp_and_freq from arc.imports import settings +from arc.settings.settings import input_filenames from arc.reaction import ARCReaction from arc.species.converter import str_to_xyz from arc.species.species import ARCSpecies, TSGuess @@ -786,15 +788,142 @@ def test_troubleshoot_ess_under_max_attempts(self): level=Level(repr={'method': 'wb97xd', 'basis': 'def2tzvp'}), project_directory=self.project_directory, job_num=201) job.ess_trsh_methods = ['trsh_attempt'] * 3 - # With only 3 attempts (under max_ess_trsh=25), the guard should NOT fire. - # Verify the error message is NOT set (i.e., the guard did not block). - # We use max_attempts - 1 to test just below the threshold. 
- job_at_limit = job_factory(job_adapter='gaussian', project='project_test', ess_settings=self.ess_settings, - species=[self.spc1], xyz=self.spc1.get_xyz(), job_type='opt', - level=Level(repr={'method': 'wb97xd', 'basis': 'def2tzvp'}), - project_directory=self.project_directory, job_num=202) - job_at_limit.ess_trsh_methods = ['trsh_attempt'] * 24 + job.job_status[1] = {'status': 'errored', 'keywords': ['SCF'], 'error': 'some error', 'line': 'line'} + with patch('arc.scheduler.trsh_ess_job', return_value=([], ['trsh_attempt', 'mock'], False, + Level(repr='wb97xd/def2tzvp'), 'gaussian', 'opt', + False, '', 14, '', 8, False)) as mock_trsh, \ + patch.object(self.sched1, 'run_job') as mock_run_job, \ + patch.object(self.sched1, 'save_restart_dict'): + self.sched1.troubleshoot_ess(label=label, job=job, + level_of_theory=Level(repr='wb97xd/def2tzvp')) + mock_trsh.assert_called_once() + mock_run_job.assert_called_once() self.assertNotIn('ESS troubleshooting attempts exhausted', self.sched1.output[label]['errors']) + + def test_troubleshoot_ess_orca_reduces_cpu_when_memory_is_capped(self): + """Test that ORCA troubleshooting preserves capped total memory and reduces cpu cores.""" + label = 'methylamine' + self.sched1.output = dict() + self.sched1.initialize_output_dict() + + job = MagicMock() + job.job_name = 'sp_a203' + job.job_type = 'sp' + job.job_adapter = 'orca' + job.level = Level(repr={'method': 'dlpno-ccsd(T)'}) + job.server = 'server1' + job.fine = True + job.cpu_cores = 32 + job.job_memory_gb = 250 + job.ess_trsh_methods = list() + job.torsions = None + job.dihedrals = None + job.directed_scan_type = None + job.rotor_index = None + job.job_status = ['done', {'status': 'errored', + 'keywords': ['MDCI', 'Memory', 'max_total_job_memory'], + 'error': 'Orca suggests to increase per cpu core memory to 10218 MB.', + 'line': 'Please increase MaxCore'}] + + with patch.object(self.sched1, 'run_job') as mock_run_job, \ + patch.object(self.sched1, 'save_restart_dict'): + 
self.sched1.troubleshoot_ess(label=label, job=job, level_of_theory=job.level) + + kwargs = mock_run_job.call_args.kwargs + self.assertEqual(kwargs['cpu_cores'], 24) + self.assertEqual(kwargs['memory'], 250) + self.assertIn('cpu', kwargs['ess_trsh_methods']) + + def test_troubleshoot_ess_orca_increases_total_memory_when_not_capped(self): + """Test that ORCA troubleshooting increases total memory when the node cap was not hit.""" + label = 'methylamine' + self.sched1.output = dict() + self.sched1.initialize_output_dict() + + job = MagicMock() + job.job_name = 'sp_a204' + job.job_type = 'sp' + job.job_adapter = 'orca' + job.level = Level(repr={'method': 'dlpno-ccsd(T)'}) + job.server = 'server1' + job.fine = True + job.cpu_cores = 32 + job.job_memory_gb = 250 + job.ess_trsh_methods = list() + job.torsions = None + job.dihedrals = None + job.directed_scan_type = None + job.rotor_index = None + job.job_status = ['done', {'status': 'errored', + 'keywords': ['MDCI', 'Memory'], + 'error': 'Orca suggests to increase per cpu core memory to 10218 MB.', + 'line': 'Please increase MaxCore'}] + + with patch.object(self.sched1, 'run_job') as mock_run_job, \ + patch.object(self.sched1, 'save_restart_dict'): + self.sched1.troubleshoot_ess(label=label, job=job, level_of_theory=job.level) + + kwargs = mock_run_job.call_args.kwargs + self.assertEqual(kwargs['cpu_cores'], 24) + self.assertEqual(kwargs['memory'], 250) + self.assertIn('memory', kwargs['ess_trsh_methods']) + + def test_troubleshoot_ess_orca_rewrites_input_with_reduced_cores_and_higher_maxcore(self): + """Test ORCA troubleshooting end-to-end from failure to rewritten input file.""" + label = 'methylamine' + self.sched1.output = dict() + self.sched1.initialize_output_dict() + + job = MagicMock() + job.job_name = 'sp_a205' + job.job_type = 'sp' + job.job_adapter = 'orca' + job.level = Level(repr={'method': 'dlpno-ccsd(T)'}) + job.server = 'server1' + job.fine = True + job.cpu_cores = 32 + job.job_memory_gb = 250 + 
job.ess_trsh_methods = list() + job.torsions = None + job.dihedrals = None + job.directed_scan_type = None + job.rotor_index = None + job.job_status = ['done', {'status': 'errored', + 'keywords': ['MDCI', 'Memory', 'max_total_job_memory'], + 'error': 'Orca suggests to increase per cpu core memory to 10218 MB.', + 'line': 'Please increase MaxCore'}] + + with patch.object(self.sched1, 'run_job') as mock_run_job, \ + patch.object(self.sched1, 'save_restart_dict'): + self.sched1.troubleshoot_ess(label=label, job=job, level_of_theory=job.level) + + kwargs = mock_run_job.call_args.kwargs + temp_project_dir = os.path.join(ARC_TESTING_PATH, 'test_scheduler_orca_trsh_input') + try: + rerun_job = job_factory(job_adapter=kwargs['job_adapter'], + project='project_test_scheduler_orca_trsh_input', + ess_settings=self.ess_settings, + species=[self.spc1], + xyz=self.spc1.get_xyz(), + job_type=kwargs['job_type'], + level=kwargs['level_of_theory'], + project_directory=temp_project_dir, + cpu_cores=kwargs['cpu_cores'], + job_memory_gb=kwargs['memory'], + ess_trsh_methods=kwargs['ess_trsh_methods'], + execution_type='incore', + fine=kwargs['fine'], + server=job.server, + testing=True) + rerun_job.write_input_file() + with open(os.path.join(rerun_job.local_path, input_filenames[rerun_job.job_adapter]), 'r') as f: + content = f.read() + original_maxcore = math.ceil(rerun_job.job_memory_gb * 1024 / job.cpu_cores) + self.assertIn('%pal nprocs 24 end', content) + self.assertIn(f'%maxcore {rerun_job.input_file_memory}', content) + self.assertGreater(rerun_job.input_file_memory, original_maxcore) + finally: + shutil.rmtree(temp_project_dir, ignore_errors=True) @patch('arc.scheduler.Scheduler.run_opt_job') def test_switch_ts_cleanup(self, mock_run_opt): @@ -1005,6 +1134,91 @@ def test_switch_ts_rotors_reset(self, mock_run_opt): # rotors_dict=None must be preserved — do not re-enable rotor scans. 
self.assertIsNone(sched2.species_dict[ts_label2].rotors_dict) + def test_check_directed_scan_job_skips_isomorphism_for_ts(self): + """check_directed_scan_job must not call check_xyz_isomorphism for a TS; is_isomorphic is recorded as True.""" + ts_xyz = str_to_xyz("""N 0.91779059 0.51946178 0.00000000 + H 1.81402049 1.03819414 0.00000000 + H 0.00000000 0.00000000 0.00000000 + H 0.91779059 1.22790192 0.72426890""") + ts_spc = ARCSpecies(label='TS_dirscan', is_ts=True, xyz=ts_xyz, multiplicity=1, charge=0, + compute_thermo=False) + ts_spc.rotors_dict = {0: {'pivots': [1, 2], 'directed_scan': {}}} + + project_directory = os.path.join(ARC_PATH, 'Projects', 'arc_project_ts_iso_dirscan') + self.addCleanup(shutil.rmtree, project_directory, ignore_errors=True) + sched = Scheduler(project='test_ts_iso_dirscan', ess_settings=self.ess_settings, + species_list=[ts_spc], + opt_level=Level(repr=default_levels_of_theory['opt']), + freq_level=Level(repr=default_levels_of_theory['freq']), + sp_level=Level(repr=default_levels_of_theory['sp']), + ts_guess_level=Level(repr=default_levels_of_theory['ts_guesses']), + project_directory=project_directory, + testing=True, + job_types=self.job_types1, + ) + + job_mock = MagicMock() + job_mock.job_status = [None, {'status': 'done'}] + job_mock.local_path_to_output_file = '/fake/path.log' + job_mock.pivots = [1, 2] + job_mock.dihedrals = [45.0] + job_mock.ess_trsh_methods = [] + + with patch('arc.species.species.ARCSpecies.check_xyz_isomorphism') as mock_iso, \ + patch('arc.scheduler.parser.parse_geometry', return_value=ts_xyz), \ + patch('arc.scheduler.parser.parse_e_elect', return_value=-123.45): + sched.check_directed_scan_job(label='TS_dirscan', job=job_mock) + + mock_iso.assert_not_called() + recorded = sched.species_dict['TS_dirscan'].rotors_dict[0]['directed_scan'][('45.00',)] + self.assertTrue(recorded['is_isomorphic']) + + @patch('arc.scheduler.Scheduler.run_opt_job') + def test_troubleshoot_scan_job_skips_isomorphism_for_ts(self, 
mock_run_opt): + """troubleshoot_scan_job must not call check_xyz_isomorphism for a TS when applying 'change conformer'.""" + ts_xyz = str_to_xyz("""N 0.91779059 0.51946178 0.00000000 + H 1.81402049 1.03819414 0.00000000 + H 0.00000000 0.00000000 0.00000000 + H 0.91779059 1.22790192 0.72426890""") + new_xyz = str_to_xyz("""N 0.91000000 0.52000000 0.00000000 + H 1.81000000 1.04000000 0.00000000 + H 0.00000000 0.00000000 0.00000000 + H 0.91000000 1.23000000 0.72000000""") + ts_spc = ARCSpecies(label='TS_trsh', is_ts=True, xyz=ts_xyz, multiplicity=1, charge=0, + compute_thermo=False) + ts_spc.rotors_dict = {0: {'pivots': [1, 2], 'scan': [3, 1, 2, 4], 'scan_path': '', + 'invalidation_reason': '', 'success': None, 'symmetry': None, + 'times_dihedral_set': 0, 'trsh_methods': [], 'trsh_counter': 0}} + + project_directory = os.path.join(ARC_PATH, 'Projects', 'arc_project_ts_iso_trsh') + self.addCleanup(shutil.rmtree, project_directory, ignore_errors=True) + sched = Scheduler(project='test_ts_iso_trsh', ess_settings=self.ess_settings, + species_list=[ts_spc], + opt_level=Level(repr=default_levels_of_theory['opt']), + freq_level=Level(repr=default_levels_of_theory['freq']), + sp_level=Level(repr=default_levels_of_theory['sp']), + ts_guess_level=Level(repr=default_levels_of_theory['ts_guesses']), + project_directory=project_directory, + testing=True, + job_types=self.job_types1, + ) + sched.trsh_ess_jobs = True + sched.trsh_rotors = True + + job_mock = MagicMock() + job_mock.species_label = 'TS_trsh' + job_mock.rotor_index = 0 + job_mock.torsions = [[3, 1, 2, 4]] + job_mock.job_name = 'scan_a200' + + with patch('arc.species.species.ARCSpecies.check_xyz_isomorphism') as mock_iso, \ + patch('arc.scheduler.Scheduler.delete_all_species_jobs'): + sched.troubleshoot_scan_job(job=job_mock, methods={'change conformer': new_xyz}) + + mock_iso.assert_not_called() + self.assertEqual(sched.species_dict['TS_trsh'].final_xyz, new_xyz) + mock_run_opt.assert_called_once() + @classmethod def 
tearDownClass(cls): """ diff --git a/arc/scripts/pipe_worker.py b/arc/scripts/pipe_worker.py index 0a8112aac9..41c452aaca 100644 --- a/arc/scripts/pipe_worker.py +++ b/arc/scripts/pipe_worker.py @@ -43,6 +43,44 @@ logger = logging.getLogger('pipe_worker') +_diagnostics_logged = False + + +def _log_node_diagnostics() -> None: + """Dump hostname, PBS/Slurm array context, PATH, PYTHONPATH, TMPDIR, and + group membership on first task failure. A dead compute node otherwise + leaves no trace of *which* node it was — the PBS `.e` file records the + array index but not the hostname, and ``tracejob`` is admin-only on + many sites. Logging once per worker process keeps the volume bounded + when one bad node drains many tasks. + """ + global _diagnostics_logged + if _diagnostics_logged: + return + _diagnostics_logged = True + import socket + import subprocess + try: + host = socket.gethostname() + except Exception: + host = 'unknown' + logger.error('--- NODE DIAGNOSTICS (first task failure on this worker) ---') + logger.error(f'hostname={host}') + for k in ('PBS_JOBID', 'PBS_ARRAY_INDEX', 'PBS_O_WORKDIR', + 'SLURM_JOB_ID', 'SLURM_ARRAY_TASK_ID', 'SLURM_NODELIST'): + v = os.environ.get(k) + if v is not None: + logger.error(f'{k}={v}') + logger.error(f'PATH={os.environ.get("PATH", "")}') + logger.error(f'PYTHONPATH={os.environ.get("PYTHONPATH", "")}') + logger.error(f'TMPDIR={os.environ.get("TMPDIR", "")}') + try: + id_out = subprocess.run(['id'], capture_output=True, text=True, timeout=5).stdout.strip() + logger.error(f'id={id_out}') + except Exception as exc: + logger.error(f'id=') + logger.error('--- END NODE DIAGNOSTICS ---') + def setup_logging(log_path: str) -> None: """Configure logging. 
Safe to call multiple times.""" @@ -202,6 +240,7 @@ def run_task(pipe_root: str, task_id: str, state: TaskStateRecord, failure_class = type(e).__name__ ended_at = time.time() logger.error(f'Task {task_id} failed: {failure_class}: {e}') + _log_node_diagnostics() if scratch_dir: _copy_outputs(scratch_dir, attempt_dir) result = locals().get('result') or _make_result_template(task_id, state.attempt_index, started_at) diff --git a/arc/settings/crest.py b/arc/settings/crest.py new file mode 100644 index 0000000000..ebd227fa53 --- /dev/null +++ b/arc/settings/crest.py @@ -0,0 +1,113 @@ +""" +Utilities for locating CREST executables and activation commands. +""" + +import os +import re +import shutil +import sys +from typing import Optional, Tuple + + +def parse_version(folder_name: str) -> Tuple[int, int, int]: + """ + Parse a version from a folder name. + + Supports patterns such as ``3.0.2``, ``v212``, ``2.1``, ``2``. + """ + version_regex = re.compile(r"(?:v?(\d+)(?:\.(\d+))?(?:\.(\d+))?)", re.IGNORECASE) + match = version_regex.search(folder_name) + if not match: + return 0, 0, 0 + + major = int(match.group(1)) if match.group(1) else 0 + minor = int(match.group(2)) if match.group(2) else 0 + patch = int(match.group(3)) if match.group(3) else 0 + + # Example: v212 -> (2, 1, 2) + if major >= 100 and match.group(2) is None and match.group(3) is None: + s = str(major).rjust(3, "0") + major, minor, patch = int(s[0]), int(s[1]), int(s[2]) + + return major, minor, patch + + +def find_highest_version_in_directory(directory: str, name_contains: str) -> Optional[str]: + """ + Find the ``crest`` executable under the highest-version matching subdirectory. 
+ """ + if not os.path.exists(directory): + return None + + highest_version_path = None + highest_version = () + for folder in os.listdir(directory): + file_path = os.path.join(directory, folder) + if name_contains.lower() in folder.lower() and os.path.isdir(file_path): + crest_path = os.path.join(file_path, "crest") + if os.path.isfile(crest_path) and os.access(crest_path, os.X_OK): + version = parse_version(folder) + if highest_version == () or version > highest_version: + highest_version = version + highest_version_path = crest_path + return highest_version_path + + +def find_crest_executable() -> Tuple[Optional[str], Optional[str]]: + """ + Return ``(crest_path, env_cmd)``. + + ``env_cmd`` is a shell snippet to activate the environment if needed, otherwise ``""``. + """ + # Priority 1: standalone builds in a configurable directory (default: /Local/ce_dana) + standalone_dir = os.getenv("ARC_CREST_STANDALONE_DIR", "/Local/ce_dana") + crest_path = find_highest_version_in_directory(standalone_dir, "crest") + if crest_path and os.path.isfile(crest_path) and os.access(crest_path, os.X_OK): + return crest_path, "" + + # Priority 2: Conda/Mamba/Micromamba envs + home = os.path.expanduser("~") + potential_env_paths = [ + os.path.join(home, "anaconda3", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "miniconda3", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "miniforge3", "envs", "crest_env", "bin", "crest"), + os.path.join(home, ".conda", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "mambaforge", "envs", "crest_env", "bin", "crest"), + os.path.join(home, "micromamba", "envs", "crest_env", "bin", "crest"), + ] + + current_env_bin = os.path.dirname(sys.executable) + potential_env_paths.insert(0, os.path.join(current_env_bin, "crest")) + + for crest_path in potential_env_paths: + if os.path.isfile(crest_path) and os.access(crest_path, os.X_OK): + env_marker = os.path.join("envs", "crest_env") + os.path.sep + env_root = 
crest_path.split(env_marker)[0] + if "micromamba" in crest_path: + env_cmd = ( + f"source {env_root}/etc/profile.d/micromamba.sh && " + f"micromamba activate crest_env" + ) + elif any(name in env_root for name in ("anaconda3", "miniconda3", "miniforge3", "mambaforge", ".conda")): + env_cmd = ( + f"source {env_root}/etc/profile.d/conda.sh && " + f"conda activate crest_env" + ) + else: + env_cmd = "" + return crest_path, env_cmd + + # Priority 3: PATH + crest_in_path = shutil.which("crest") + if crest_in_path: + return crest_in_path, "" + + return None, None + + +__all__ = [ + "parse_version", + "find_highest_version_in_directory", + "find_crest_executable", +] + diff --git a/arc/settings/crest_test.py b/arc/settings/crest_test.py new file mode 100644 index 0000000000..d7793604ed --- /dev/null +++ b/arc/settings/crest_test.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +Unit tests for arc.settings.crest +""" + +import os +import stat +import tempfile +import unittest +from unittest.mock import patch + +from arc.settings.crest import ( + find_crest_executable, + find_highest_version_in_directory, + parse_version, +) + + +class TestCrestSettingsUtils(unittest.TestCase): + + def _make_executable(self, path: str): + with open(path, "w") as f: + f.write("#!/bin/bash\n") + st = os.stat(path) + os.chmod(path, st.st_mode | stat.S_IXUSR) + + def test_parse_version(self): + self.assertEqual(parse_version("crest-3.0.2"), (3, 0, 2)) + self.assertEqual(parse_version("v212"), (2, 1, 2)) + self.assertEqual(parse_version("version-2.1"), (2, 1, 0)) + self.assertEqual(parse_version("foo"), (0, 0, 0)) + + def test_find_highest_version_in_directory(self): + with tempfile.TemporaryDirectory() as td: + low = os.path.join(td, "crest-2.1") + high = os.path.join(td, "crest-3.0.2") + os.makedirs(low) + os.makedirs(high) + self._make_executable(os.path.join(low, "crest")) + self._make_executable(os.path.join(high, "crest")) + + found = 
find_highest_version_in_directory(td, "crest") + self.assertEqual(found, os.path.join(high, "crest")) + + def test_find_crest_executable_prefers_standalone(self): + with tempfile.TemporaryDirectory() as td: + standalone = os.path.join(td, "crest-3.0.2") + os.makedirs(standalone) + standalone_crest = os.path.join(standalone, "crest") + self._make_executable(standalone_crest) + + with patch.dict(os.environ, {"ARC_CREST_STANDALONE_DIR": td}, clear=False): + path, env_cmd = find_crest_executable() + self.assertEqual(path, standalone_crest) + self.assertEqual(env_cmd, "") + + def test_find_crest_executable_env_detection(self): + with tempfile.TemporaryDirectory() as td: + fake_home = os.path.join(td, "home") + os.makedirs(fake_home) + crest_path = os.path.join(fake_home, "miniforge3", "envs", "crest_env", "bin", "crest") + os.makedirs(os.path.dirname(crest_path), exist_ok=True) + self._make_executable(crest_path) + + with patch("arc.settings.crest.os.path.expanduser", return_value=fake_home): + with patch("arc.settings.crest.sys.executable", os.path.join(td, "python")): + with patch("arc.settings.crest.shutil.which", return_value=None): + path, env_cmd = find_crest_executable() + self.assertEqual(path, crest_path) + self.assertIn("conda activate crest_env", env_cmd) + + +if __name__ == "__main__": + unittest.main() + diff --git a/arc/settings/settings.py b/arc/settings/settings.py index 8e70390c35..305e180b5f 100644 --- a/arc/settings/settings.py +++ b/arc/settings/settings.py @@ -9,6 +9,12 @@ import os import string import sys +import shutil +from arc.settings.crest import ( + find_crest_executable, + find_highest_version_in_directory, + parse_version, +) # Users should update the following server dictionary. 
# Instructions for RSA key generation can be found here: @@ -89,7 +95,7 @@ supported_ess = ['cfour', 'gaussian', 'mockter', 'molpro', 'orca', 'qchem', 'terachem', 'onedmin', 'xtb', 'torchani', 'openbabel'] # TS methods to try when appropriate for a reaction (other than user guesses which are always allowed): -ts_adapters = ['heuristics', 'AutoTST', 'GCN', 'xtb_gsm', 'orca_neb'] +ts_adapters = ['heuristics', 'AutoTST', 'GCN', 'xtb_gsm', 'orca_neb', 'crest'] # List here job types to execute by default default_job_types = {'conf_opt': True, # defaults to True if not specified @@ -453,3 +459,57 @@ def add_rmg_db_candidates(prefix: str) -> None: if path and os.path.isdir(path): RMG_DB_PATH = path break + +CREST_PATH, CREST_ENV_PATH = find_crest_executable() + +__all__ = [ + "servers", + "global_ess_settings", + "supported_ess", + "ts_adapters", + "default_job_types", + "levels_ess", + "check_status_command", + "submit_command", + "delete_command", + "list_available_nodes_command", + "submit_filenames", + "t_max_format", + "input_filenames", + "output_filenames", + "default_levels_of_theory", + "orca_default_options_dict", + "tani_default_options_dict", + "ob_default_settings", + "xtb_gsm_settings", + "valid_chars", + "rotor_scan_resolution", + "maximum_barrier", + "minimum_barrier", + "inconsistency_az", + "inconsistency_ab", + "max_rotor_trsh", + "preserve_params_in_scan", + "workers_coeff", + "default_job_settings", + "ARC_FAMILIES_PATH", + "home", + "TANI_PYTHON", + "OB_PYTHON", + "TS_GCN_PYTHON", + "AUTOTST_PYTHON", + "ARC_PYTHON", + "RMG_ENV_NAME", + "RMG_PYTHON", + "XTB", + "exported_rmg_path", + "exported_rmg_db_path", + "gw", + "find_executable", + "add_rmg_db_candidates", + "parse_version", + "find_highest_version_in_directory", + "find_crest_executable", + "CREST_PATH", + "CREST_ENV_PATH", +] diff --git a/arc/species/converter.py b/arc/species/converter.py index ce0d484541..9d19a4b13d 100644 --- a/arc/species/converter.py +++ b/arc/species/converter.py @@ -5,6 
+5,7 @@ import math import numpy as np import os +import warnings from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union from ase import Atoms @@ -48,6 +49,103 @@ DIST_PRECISION = 0.01 # Angstrom ANGL_PRECISION = 0.1 # rad (for both bond angle and dihedral) +def reorder_xyz_string(xyz_str: str, + reverse_atoms: bool = False, + units: str = 'angstrom', + convert_to: str = 'angstrom', + project_directory: Optional[str] = None + ) -> str: + """ + Reorder an XYZ string between ``ATOM X Y Z`` and ``X Y Z ATOM`` with optional unit conversion. + + Args: + xyz_str (str): The string xyz format to be converted. + reverse_atoms (bool, optional): Whether to reverse the atoms and coordinates. + units (str, optional): Units of the input coordinates ('angstrom' or 'bohr'). + convert_to (str, optional): The units to convert to (either 'angstrom' or 'bohr'). + project_directory (str, optional): The path to the project directory. + + Raises: + ConverterError: If xyz_str is not a string or does not have four space-separated entries per non-empty line. + + Returns: str + The converted string xyz format. 
+ """ + if isinstance(xyz_str, tuple): + xyz_str = '\n'.join(xyz_str) + if isinstance(xyz_str, list): + xyz_str = '\n'.join(xyz_str) + if not isinstance(xyz_str, str): + raise ConverterError(f'Expected a string input, got {type(xyz_str)}') + if project_directory is not None: + file_path = os.path.join(project_directory, xyz_str) + if os.path.isfile(file_path): + with open(file_path, 'r') as f: + xyz_str = f.read() + + + if units.lower() == 'angstrom' and convert_to.lower() == 'angstrom': + conversion_factor = 1 + elif units.lower() == 'bohr' and convert_to.lower() == 'bohr': + conversion_factor = 1 + elif units.lower() == 'angstrom' and convert_to.lower() == 'bohr': + conversion_factor = constants.angstrom_to_bohr + elif units.lower() == 'bohr' and convert_to.lower() == 'angstrom': + conversion_factor = constants.bohr_to_angstrom + else: + raise ConverterError("Invalid target unit. Choose 'angstrom' or 'bohr'.") + + processed_lines = list() + # Split the string into lines + lxyz = xyz_str.strip().splitlines() + # Determine whether the atom label appears first or last in each line + first_line_tokens = lxyz[0].strip().split() + atom_first = not is_str_float(first_line_tokens[0]) + + for item in lxyz: + parts = item.strip().split() + + if len(parts) != 4: + raise ConverterError(f'xyz_str has an incorrect format, expected 4 elements in each line, ' + f'got "{item}" in:\n{xyz_str}') + if atom_first: + atom, x_str, y_str, z_str = parts + else: + x_str, y_str, z_str, atom = parts + + try: + x = float(x_str) * conversion_factor + y = float(y_str) * conversion_factor + z = float(z_str) * conversion_factor + + except ValueError as e: + raise ConverterError(f'Could not convert {x_str}, {y_str}, or {z_str} to floats.') from e + + if reverse_atoms and atom_first: + formatted_line = f'{x} {y} {z} {atom}' + elif reverse_atoms and not atom_first: + formatted_line = f'{atom} {x} {y} {z}' + elif not reverse_atoms and atom_first: + formatted_line = f'{atom} {x} {y} {z}' + elif not 
reverse_atoms and not atom_first: + formatted_line = f'{x} {y} {z} {atom}' + + processed_lines.append(formatted_line) + + return '\n'.join(processed_lines) + + +def str_to_str(*args, **kwargs) -> str: + """ + Backwards compatible wrapper for reorder_xyz_string. + """ + warnings.warn( + "str_to_str was renamed to reorder_xyz_string and will be removed in a future ARC release", + DeprecationWarning, + stacklevel=2, + ) + return reorder_xyz_string(*args, **kwargs) + def str_to_xyz(xyz_str: str, project_directory: Optional[str] = None, diff --git a/arc/species/converter_test.py b/arc/species/converter_test.py index aa881bdcac..f423c4d500 100644 --- a/arc/species/converter_test.py +++ b/arc/species/converter_test.py @@ -18,6 +18,7 @@ import arc.species.converter as converter from arc.common import ARC_PATH, ARC_TESTING_PATH, almost_equal_coords, almost_equal_coords_lists, almost_equal_lists +from arc.constants import angstrom_to_bohr from arc.exceptions import ConverterError from arc.molecule.molecule import Molecule from arc.species.perceive import perceive_molecule_from_xyz @@ -700,6 +701,37 @@ def test_str_to_xyz(self): xyz = converter.str_to_xyz(xyz_format) self.assertEqual(xyz, expected_xyz) + def test_reorder_xyz_string_atom_first(self): + """Test reordering atom-first XYZ strings with unit conversion""" + xyz_format = "C 0.0 1.0 2.0\nH -1.0 0.5 0.0" + converted = converter.reorder_xyz_string(xyz_str=xyz_format, reverse_atoms=True, convert_to="bohr") + converted_lines = converted.splitlines() + self.assertEqual(len(converted_lines), 2) + + x1, y1, z1, s1 = converted_lines[0].split() + self.assertEqual(s1, "C") + self.assertAlmostEqual(float(x1), 0.0) + self.assertAlmostEqual(float(y1), 1.0 * angstrom_to_bohr) + self.assertAlmostEqual(float(z1), 2.0 * angstrom_to_bohr) + + x2, y2, z2, s2 = converted_lines[1].split() + self.assertEqual(s2, "H") + self.assertAlmostEqual(float(x2), -1.0 * angstrom_to_bohr) + self.assertAlmostEqual(float(y2), 0.5 * angstrom_to_bohr) + 
self.assertAlmostEqual(float(z2), 0.0) + + def test_reorder_xyz_string_coordinate_first(self): + """Test reordering coordinate-first XYZ strings back to atom-last order with conversion""" + xyz_format = "0.0 0.0 0.0 N\n1.0 0.0 0.0 H" + converted = converter.reorder_xyz_string( + xyz_str=xyz_format, + reverse_atoms=False, + units="bohr", + convert_to="angstrom", + ) + expected = "0.0 0.0 0.0 N\n0.529177 0.0 0.0 H" + self.assertEqual(converted, expected) + def test_xyz_to_str(self): """Test converting an ARC xyz format to a string xyz format""" xyz_str1 = converter.xyz_to_str(xyz_dict=self.xyz1['dict']) diff --git a/arc/species/perceive.py b/arc/species/perceive.py index ceed7a0e99..82c09d4992 100644 --- a/arc/species/perceive.py +++ b/arc/species/perceive.py @@ -33,6 +33,7 @@ def perceive_molecule_from_xyz( n_radicals: int | None = None, n_fragments: int | None = None, single_bond_tolerance: float = 1.20, + is_ts: bool = False, ) -> Molecule | None: """ Infer a chemically valid Molecule with localized Lewis structure from Cartesian coordinates. @@ -119,7 +120,7 @@ def perceive_molecule_from_xyz( # if we expected multiple fragments, hand off to the multi‐frag helper if len(fragments) != 1: if len(fragments) == n_fragments: - return _combine_fragments(symbols, coords, fragments, charge) + return _combine_fragments(symbols, coords, fragments, charge, is_ts=is_ts) return None # otherwise fall back on the existing single‐molecule code @@ -149,6 +150,7 @@ def _combine_fragments( coords: tuple[tuple[float, float, float], ...], fragments: list[list[int]], total_charge: int, + is_ts: bool = False, ) -> Molecule: """ Build disconnected fragments separately, then stitch them into one Molecule with charges distributed. 
@@ -273,7 +275,7 @@ def _perceive_frag(idxs: list[int], charge: int) -> Molecule | None: best_mol.multiplicity = max(sm.multiplicity for sm in submols) assign_formal_charges(best_mol) enforce_target_charge(best_mol, total_charge) - _add_interfragment_bonds(best_mol, fragments, coords) + _add_interfragment_bonds(best_mol, fragments, coords, is_ts=is_ts) # restore original atom order idx_map: dict[int, int] = dict() @@ -297,13 +299,16 @@ def _add_interfragment_bonds( mol: Molecule, fragments: list[list[int]], coords: tuple[tuple[float, float, float], ...], + is_ts: bool = False, ) -> None: """ Connect separate fragments in a molecule by adding inter-fragment bonds. For each adjacent fragment pair, the algorithm finds the closest atom pair (based on `coords`) and adds a bond between them: - • If at least one atom is a radical, use bond order = 1.0 by default. + • For a TS, always use bond order = 0.05 (reaction bond) since the + inter-fragment contact is the breaking/forming bond by construction. + • Otherwise, if at least one atom is a radical, use bond order = 1.0 by default. If neither has available valence, use bond order = 0.05 (weak link). • If neither atom is a radical, set bond order = 0.0 (no real bond). 
@@ -337,7 +342,9 @@ def _add_interfragment_bonds( a1 = mol.atoms[idx_map[i0]] a2 = mol.atoms[idx_map[j0]] - if a1.radical_electrons > 0 or a2.radical_electrons > 0: + if is_ts: + bond_order = 0.05 + elif a1.radical_electrons > 0 or a2.radical_electrons > 0: bond_order = 1.0 if n_missing_electrons(a1) or n_missing_electrons(a2) else 0.05 else: bond_order = 0.0 diff --git a/arc/species/species.py b/arc/species/species.py index 76a9105dfd..a6759ce163 100644 --- a/arc/species/species.py +++ b/arc/species/species.py @@ -1432,6 +1432,7 @@ def set_dihedral(self, multiplicity=self.multiplicity, n_radicals=self.number_of_radicals, n_fragments=self.get_n_fragments(), + is_ts=self.is_ts, ) if chk_rotor_list: for rotor in self.rotors_dict.values(): @@ -1617,6 +1618,8 @@ def mol_from_xyz(self, Important for TS searches and for identifying rotor indices. This works by generating a molecule from xyz and using the 2D structure to confirm that the perceived molecule is correct. + For TSs, the perceived molecule is accepted without enforcing + 2D-graph isomorphism, since TS connectivity is not strictly defined. If ``xyz`` is not given, the species xyz attribute will be used. 
Args: @@ -1637,36 +1640,42 @@ def mol_from_xyz(self, multiplicity=self.multiplicity, n_radicals=self.number_of_radicals, n_fragments=self.get_n_fragments(), + is_ts=self.is_ts, ) if perceived_mol is not None: - allow_nonisomorphic_2d = (self.charge is not None and self.charge) \ - or self.mol.has_charge() or perceived_mol.has_charge() \ - or (self.multiplicity is not None and self.multiplicity >= 3) \ - or self.mol.multiplicity >= 3 or perceived_mol.multiplicity >= 3 - isomorphic = self.check_xyz_isomorphism(mol=perceived_mol, - xyz=xyz, - allow_nonisomorphic_2d=allow_nonisomorphic_2d) - if not isomorphic: - logger.warning(f'XYZ and the 2D graph representation for {self.label} are not isomorphic.\nGot ' - f'xyz:\n{xyz}\n\nwhich corresponds to {self.mol.copy(deep=True).to_smiles()}\n' - f'{self.mol.copy(deep=True).to_adjacency_list()}\n\nand: ' - f'{self.mol.copy(deep=True).to_smiles()}\n' - f'{self.mol.copy(deep=True).to_adjacency_list()}') - raise SpeciesError(f'XYZ and the 2D graph representation for {self.label} are not compliant.') - if not self.keep_mol: - if is_mol_valid(perceived_mol, charge=self.charge, multiplicity=self.multiplicity, n_radicals=self.number_of_radicals): + if self.is_ts: + if not self.keep_mol: self.mol = perceived_mol - else: - try: - order_atoms(ref_mol=perceived_mol, mol=self.mol) - except SanitizationError: + else: + allow_nonisomorphic_2d = (self.charge is not None and self.charge) \ + or self.mol.has_charge() or perceived_mol.has_charge() \ + or (self.multiplicity is not None and self.multiplicity >= 3) \ + or self.mol.multiplicity >= 3 or perceived_mol.multiplicity >= 3 + isomorphic = self.check_xyz_isomorphism(mol=perceived_mol, + xyz=xyz, + allow_nonisomorphic_2d=allow_nonisomorphic_2d) + if not isomorphic: + logger.warning(f'XYZ and the 2D graph representation for {self.label} are not isomorphic.\nGot ' + f'xyz:\n{xyz}\n\nwhich corresponds to {self.mol.copy(deep=True).to_smiles()}\n' + 
f'{self.mol.copy(deep=True).to_adjacency_list()}\n\nand: ' + f'{perceived_mol.copy(deep=True).to_smiles()}\n' + f'{perceived_mol.copy(deep=True).to_adjacency_list()}') + raise SpeciesError(f'XYZ and the 2D graph representation for {self.label} are not compliant.') + if not self.keep_mol: + if is_mol_valid(perceived_mol, charge=self.charge, multiplicity=self.multiplicity, n_radicals=self.number_of_radicals): self.mol = perceived_mol + else: + try: + order_atoms(ref_mol=perceived_mol, mol=self.mol) + except SanitizationError: + self.mol = perceived_mol else: perceived_mol = perceive_molecule_from_xyz(xyz, charge=self.charge, multiplicity=self.multiplicity, n_radicals=self.number_of_radicals, n_fragments=self.get_n_fragments(), + is_ts=self.is_ts, ) if perceived_mol is None and self.is_ts: perceived_mol = perceive_molecule_from_xyz(xyz, @@ -1674,6 +1683,7 @@ def mol_from_xyz(self, multiplicity=self.multiplicity, n_radicals=self.number_of_radicals, n_fragments=2, + is_ts=True, ) if perceived_mol is not None: self.mol = perceived_mol @@ -1847,6 +1857,7 @@ def check_xyz_isomorphism(self, multiplicity=self.multiplicity, n_radicals=self.number_of_radicals, n_fragments=self.get_n_fragments(), + is_ts=self.is_ts, ) # 2. A. 
Check isomorphism with bond orders using b_mol diff --git a/arc/species/species_test.py b/arc/species/species_test.py index 466217ff36..3e98186121 100644 --- a/arc/species/species_test.py +++ b/arc/species/species_test.py @@ -1684,6 +1684,43 @@ def test_mol_from_xyz(self): radical_count += atom.radical_electrons self.assertEqual(radical_count, 2) + def test_ts_mol_from_xyz_skips_isomorphism_enforcement(self): + """Test that a TS accepts the perceived xyz-derived molecule even if a stored 2D graph disagrees.""" + xyz = {'symbols': ('O', 'O', 'H', 'C', 'C', 'C', 'C', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H'), + 'isotopes': (16, 16, 1, 12, 12, 12, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), + 'coords': ((1.570004, 0.919237, -0.063628), (2.346384, -0.215837, 0.116826), + (2.578713, -0.467096, -0.784756), (-0.673058, -1.117816, -1.045216), + (-0.76037, -0.036132, 0.001231), (-0.850886, -0.54288, 1.418092), + (-1.694638, 1.099253, -0.333714), (-1.612555, -1.681809, -1.088997), + (-0.491563, -0.700452, -2.03735), (0.122945, -1.827698, -0.811666), + (0.427835, 0.508898, -0.034066), (-0.034676, -1.231859, 1.641118), + (-1.795224, -1.080235, 1.568607), (-0.814118, 0.27711, 2.136337), + (-2.733958, 0.749029, -0.320335), (-1.610541, 1.910407, 0.391085), + (-1.494252, 1.501954, -1.327915))} + adj = """multiplicity 2 +1 O u0 p2 c0 {2,S} {17,S} +2 O u1 p2 c0 {1,S} +3 C u0 p0 c0 {4,S} {5,S} {6,S} {7,S} +4 C u0 p0 c0 {3,S} {8,S} {9,S} {10,S} +5 C u0 p0 c0 {3,S} {11,S} {12,S} {13,S} +6 C u0 p0 c0 {3,S} {14,S} {15,S} {16,S} +7 H u0 p0 c0 {3,S} +8 H u0 p0 c0 {4,S} +9 H u0 p0 c0 {4,S} +10 H u0 p0 c0 {4,S} +11 H u0 p0 c0 {5,S} +12 H u0 p0 c0 {5,S} {17,vdW} +13 H u0 p0 c0 {5,S} +14 H u0 p0 c0 {6,S} +15 H u0 p0 c0 {6,S} +16 H u0 p0 c0 {6,S} +17 H u0 p0 c0 {1,S} {12,vdW}""" + + spc = ARCSpecies(label='TS0', adjlist=adj, xyz=xyz, is_ts=True, multiplicity=2, charge=0) + self.assertIn('3 H u0 p0 c0 {2,S}', spc.mol.to_adjacency_list()) + self.assertIn('11 H u0 p0 c0 {1,S} {5,S}', 
spc.mol.to_adjacency_list()) + self.assertNotIn('{17,vdW}', spc.mol.to_adjacency_list()) + def test_consistent_atom_order(self): """Test that the atom order is preserved whether starting from SMILES or from xyz""" xyz9 = """O -1.17310019 -0.30822930 0.16269772 diff --git a/arc/testing/server/pbs/timelimit/err.txt b/arc/testing/server/pbs/timelimit/err.txt deleted file mode 100644 index 17a55b3536..0000000000 --- a/arc/testing/server/pbs/timelimit/err.txt +++ /dev/null @@ -1,17 +0,0 @@ -=>> PBS: job killed: walltime 86415 exceeded limit 86400 -Error: software termination - rax fffffffffffffffc, rbx 00007ffc0d4f90d0, rcx ffffffffffffffff - rdx 0000000000000000, rsp 00007ffc0d4f9098, rbp 0000000000000001 - rsi 00007ffc0d4f90d0, rdi 0000000000038f1b, r8 00002b7af22a5700 - r9 0000000000000000, r10 0000000000000000, r11 0000000000000246 - r12 00007ffc0d4f90f0, r13 000000000000008f, r14 0000000000000000 - r15 00007ffc0d4fff40 -Error: software termination - rax 0000000000024fa8, rbx 00002ae812e9f2c0, rcx 0000000000035498 - rdx 00002ae8c4888bd0, rsp 00007ffde70fb680, rbp 00007ffde70fbf70 - rsi 00002ae8c48be068, rdi 00002ae8c48f3508, r8 00002ae8c49289b0 - r9 0000000000006a93, r10 0000000000006a95, r11 00002ae812ed4768 - r12 00002ae812f66508, r13 00002ae812f9b9b0, r14 0000000000006a92 - r15 00002ae81311f478 - --- traceback not available - --- traceback not available diff --git a/devtools/crest_environment.yml b/devtools/crest_environment.yml new file mode 100644 index 0000000000..2291e72d37 --- /dev/null +++ b/devtools/crest_environment.yml @@ -0,0 +1,6 @@ +name: crest_env +channels: + - conda-forge +dependencies: + - python>=3.7 + - crest=2.12 diff --git a/devtools/install_all.sh b/devtools/install_all.sh index c958fdd548..c9de207ef7 100644 --- a/devtools/install_all.sh +++ b/devtools/install_all.sh @@ -26,6 +26,8 @@ run_devtool () { bash "$DEVTOOLS_DIR/$1" "${@:2}"; } SKIP_CLEAN=false SKIP_EXT=false SKIP_ARC=false +SKIP_RMG=false +ARC_INSTALLED=false RMG_ARGS=() ARC_ARGS=() 
EXT_ARGS=() @@ -36,6 +38,7 @@ while [[ $# -gt 0 ]]; do --no-clean) SKIP_CLEAN=true ;; --no-ext) SKIP_EXT=true ;; --no-arc) SKIP_ARC=true ;; + --no-rmg) SKIP_RMG=true ;; --rmg-*) RMG_ARGS+=("--${1#--rmg-}") ;; --arc-*) ARC_ARGS+=("--${1#--arc-}") ;; --ext-*) EXT_ARGS+=("--${1#--ext-}") ;; @@ -44,6 +47,7 @@ while [[ $# -gt 0 ]]; do Usage: $0 [global-flags] [--rmg-xxx] [--arc-yyy] [--ext-zzz] --no-clean Skip micromamba/conda cache cleanup --no-ext Skip external tools (AutoTST, KinBot, …) + --no-rmg Skip RMG-Py entirely --rmg-path Forward '--path' to RMG installer --rmg-pip Forward '--pip' to RMG installer ... @@ -67,16 +71,15 @@ echo " EXT sub-flags : ${EXT_ARGS[*]:-(none)}" echo ">>> Beginning full ARC external repo installation…" pushd . >/dev/null -# 1) RMG -echo "=== Installing RMG ===" -run_devtool install_rmg.sh "${RMG_ARGS[@]}" - - - # 2) PyRDL - echo "=== Installing PyRDL ===" - bash devtools/install_pyrdl.sh +# 1) RMG (optional) +if [[ $SKIP_RMG == false ]]; then + echo "=== Installing RMG ===" + run_devtool install_rmg.sh "${RMG_ARGS[@]}" +else + echo "ℹ️ --no-rmg flag set. Skipping RMG installation." +fi -# 3) ARC itself (skip env creation in CI or if user requests it) +# 2) ARC itself (skip env creation in CI or if user requests it) if [[ "${CI:-false}" != "true" && "${SKIP_ARC:-false}" != "true" ]]; then if [[ $SKIP_CLEAN == false ]]; then echo "=== Cleaning up old ARC build artifacts ===" @@ -88,10 +91,23 @@ if [[ "${CI:-false}" != "true" && "${SKIP_ARC:-false}" != "true" ]]; then echo "=== Installing ARC ===" run_devtool install_arc.sh "${ARC_ARGS[@]}" + ARC_INSTALLED=true else + ARC_INSTALLED=false echo ":information_source: CI detected or --no-arc flag set. Skip cleaning ARC installation." 
fi +# 3) PyRDL (needs arc_env, but not ARC install) +if [[ "${CI:-false}" == "true" ]]; then + echo "=== Installing PyRDL (CI) ===" + bash devtools/install_pyrdl.sh +elif [[ $ARC_INSTALLED == true ]]; then + echo "=== Installing PyRDL ===" + bash devtools/install_pyrdl.sh +else + echo "ℹ️ Skipping PyRDL install because ARC installation was skipped." +fi + if [[ $SKIP_EXT == false ]]; then # map of friendly names → installer scripts declare -A EXT_INSTALLERS=( @@ -100,6 +116,7 @@ if [[ $SKIP_EXT == false ]]; then [KinBot]=install_kinbot.sh [OpenBabel]=install_ob.sh [xtb]=install_xtb.sh + [CREST]=install_crest.sh [Sella]=install_sella.sh [TorchANI]=install_torchani.sh ) diff --git a/devtools/install_autotst.sh b/devtools/install_autotst.sh index 5e3bc35288..e71e42d035 100644 --- a/devtools/install_autotst.sh +++ b/devtools/install_autotst.sh @@ -31,6 +31,8 @@ done # where "$(pwd)" is the path to the AutoTST repository. write_hook () { local env="$1" repo_path="$2" # repo_path="$(pwd)" in AutoTST + local repo_path_escaped + repo_path_escaped=$(printf '%q' "$repo_path") $COMMAND_PKG env list | awk '{print $1}' | grep -qx "$env" || return 0 # env prefix @@ -50,16 +52,37 @@ write_hook () { # --- activation -------------------------------------------------------- cat >"$act" <>"$act" <<'EOF' +# Remove RMG-Py from PATH/PYTHONPATH to avoid clashes while AutoTST is active. 
+if [[ -n "${RMG_PY_PATH:-}" ]]; then + export PATH="$(_strip_path "$RMG_PY_PATH" "$PATH")" + export PYTHONPATH="$(_strip_path "$RMG_PY_PATH" "${PYTHONPATH:-}")" +fi +EOF + fi + + cat >>"$act" <<'EOF' case ":\$PYTHONPATH:" in *":\$AUTOTST_ROOT:"*) ;; \ *) export PYTHONPATH="\$AUTOTST_ROOT:\${PYTHONPATH:-}" ;; esac EOF # --- de-activation ----------------------------------------------------- cat >"$deact" <<'EOF' -_strip () { local n=":$1:"; local s=":$2:"; echo "${s//$n/:}" | sed 's/^://;s/:$//'; } -export PYTHONPATH=$(_strip "$AUTOTST_ROOT" ":${PYTHONPATH:-}:") -unset AUTOTST_ROOT +export PATH="${AUTOTST_OLD_PATH:-$PATH}" +if [[ -n "${AUTOTST_OLD_PYTHONPATH+x}" ]]; then + export PYTHONPATH="$AUTOTST_OLD_PYTHONPATH" +else + unset PYTHONPATH +fi +unset AUTOTST_ROOT AUTOTST_OLD_PATH AUTOTST_OLD_PYTHONPATH EOF echo "🔗 AutoTST hook refreshed in $env" } @@ -115,12 +138,53 @@ fi if [[ $MODE == "path" ]]; then - AUTO_PATH_LINE="export PYTHONPATH=\"\$PYTHONPATH:$(pwd)\"" - if ! grep -Fqx "$AUTO_PATH_LINE" ~/.bashrc; then - echo "$AUTO_PATH_LINE" >> ~/.bashrc - echo "✔️ Added AutoTST path to ~/.bashrc" + HOOK_SENTINEL="# AutoTST path-mode hook" + if ! 
grep -Fqx "$HOOK_SENTINEL" ~/.bashrc; then + cat <<'EOF' >> ~/.bashrc +# AutoTST path-mode hook +_strip_path () { + local needle=":$1:" + local haystack=":$2:" + echo "${haystack//$needle/:}" | sed 's/^://;s/:$//' +} + +autotst_on () { + export AUTOTST_ROOT="__AUTOTST_PATH__" + export AUTOTST_OLD_PATH="$PATH" + export AUTOTST_OLD_PYTHONPATH="${PYTHONPATH:-}" + if [[ -n "${RMG_PY_PATH:-}" ]]; then + PATH="$(_strip_path "$RMG_PY_PATH" "$PATH")" + PYTHONPATH="$(_strip_path "$RMG_PY_PATH" "${PYTHONPATH:-}")" + fi + + case ":$PYTHONPATH:" in *":$AUTOTST_ROOT:"*) ;; \ + *) PYTHONPATH="$AUTOTST_ROOT:${PYTHONPATH:-}" ;; esac + export PATH PYTHONPATH +} + +autotst_off () { + export PATH="${AUTOTST_OLD_PATH:-$PATH}" + if [[ -n "${AUTOTST_OLD_PYTHONPATH+x}" ]]; then + export PYTHONPATH="$AUTOTST_OLD_PYTHONPATH" + else + unset PYTHONPATH + fi + unset AUTOTST_ROOT AUTOTST_OLD_PATH AUTOTST_OLD_PYTHONPATH +} + +# Enable AutoTST by default in new shells and keep RMG-Py out of the way. +autotst_on +EOF + # replace placeholder with actual path (portable across GNU/BSD sed) + AUTOTST_ESCAPED_PATH="$(printf '%q' "$(pwd)" | sed 's#/#\\\\/#g')" + if sed --version >/dev/null 2>&1; then + sed -i "s#__AUTOTST_PATH__#${AUTOTST_ESCAPED_PATH}#" ~/.bashrc + else + sed -i '' "s#__AUTOTST_PATH__#${AUTOTST_ESCAPED_PATH}#" ~/.bashrc + fi + echo "✔️ Added AutoTST path-mode hook to ~/.bashrc" else - echo "ℹ️ AutoTST path already exists in ~/.bashrc" + echo "ℹ️ AutoTST path-mode hook already exists in ~/.bashrc" fi elif [[ $MODE == "conda" ]]; then write_hook tst_env "$(pwd)" diff --git a/devtools/install_crest.sh b/devtools/install_crest.sh new file mode 100644 index 0000000000..1086ec9db2 --- /dev/null +++ b/devtools/install_crest.sh @@ -0,0 +1,64 @@ +#!/bin/bash -l +set -eo pipefail + +if command -v micromamba &> /dev/null; then + echo "✔️ Micromamba is installed." + COMMAND_PKG=micromamba +elif command -v mamba &> /dev/null; then + echo "✔️ Mamba is installed." 
+ COMMAND_PKG=mamba +elif command -v conda &> /dev/null; then + echo "✔️ Conda is installed." + COMMAND_PKG=conda +else + echo "❌ Micromamba, Mamba, or Conda is required. Please install one." + exit 1 +fi + +if [ "$COMMAND_PKG" = "micromamba" ]; then + eval "$(micromamba shell hook --shell=bash)" +else + BASE=$(conda info --base) + . "$BASE/etc/profile.d/conda.sh" +fi + +ENV_FILE="devtools/crest_environment.yml" + +if [ ! -f "$ENV_FILE" ]; then + echo "❌ File not found: $ENV_FILE" + exit 1 +fi + +if $COMMAND_PKG env list | grep -q '^crest_env\s'; then + echo ">>> Updating existing crest_env..." + $COMMAND_PKG env update -n crest_env -f "$ENV_FILE" --prune +else + echo ">>> Creating new crest_env..." + $COMMAND_PKG env create -n crest_env -f "$ENV_FILE" -y +fi + +echo ">>> Checking CREST installation..." + +if [ "$COMMAND_PKG" = "micromamba" ]; then + CREST_RUNNER="micromamba run -n crest_env" + CREST_LISTER="micromamba list -n crest_env" +else + CREST_RUNNER="conda run -n crest_env" + CREST_LISTER="conda list -n crest_env" +fi + +if $CREST_RUNNER crest --version &> /dev/null; then + version_output=$($CREST_RUNNER crest --version 2>&1) + echo "$version_output" + installed_version=$(printf '%s' "$version_output" | tr '\n' ' ' | sed -n 's/.*Version[[:space:]]\+\([0-9.][0-9.]*\).*/\1/p') + if [ "$installed_version" != "2.12" ]; then + echo "❌ CREST version mismatch (expected 2.12)." + exit 1 + fi + echo "✔️ CREST 2.12 is successfully installed." +else + echo "❌ CREST is not found in PATH. Please check the environment." + exit 1 +fi + +echo "✅ Done installing CREST (crest_env)." 
diff --git a/devtools/install_gcn.sh b/devtools/install_gcn.sh index 8f83a2cda1..5273353d77 100644 --- a/devtools/install_gcn.sh +++ b/devtools/install_gcn.sh @@ -93,12 +93,12 @@ write_hook() { # env_name repo_path rm -f "$act" "$deact" # --- activation hook ----------------------------------------------------- - cat <<'ACTHOOK' >"$act" + cat <<ACTHOOK >"$act" # TS-GCN hook – $(date +%F) export TSGCN_ROOT="$repo" -case ":$PYTHONPATH:" in - *":$TSGCN_ROOT:") ;; \ - *) export PYTHONPATH="$TSGCN_ROOT:\${PYTHONPATH:-}" ;; +case ":\$PYTHONPATH:" in + *":\$TSGCN_ROOT:") ;; \ + *) export PYTHONPATH="\$TSGCN_ROOT:\${PYTHONPATH:-}" ;; esac ACTHOOK @@ -182,46 +182,43 @@ CORE_PKGS=( # ── inline env creation & unified PyTorch install -------------------------- if $COMMAND_PKG env list | awk '{print $1}' | grep -qx ts_gcn; then - $COMMAND_PKG env update -n ts_gcn \ + $COMMAND_PKG install -n ts_gcn \ -c schrodinger -c conda-forge \ --channel-priority flexible \ "${CORE_PKGS[@]}" \ - --prune -y + --yes else - $COMMAND_PKG env create -n ts_gcn \ + $COMMAND_PKG create -n ts_gcn \ -c schrodinger -c conda-forge \ --channel-priority flexible \ "${CORE_PKGS[@]}" \ - -y + --yes fi - # 2) activate it - we set +u to avoid printing variable names - # that are not set yet - set +u; $COMMAND_PKG activate ts_gcn; set -u - - # 3) pip‐install exactly the CPU or CUDA wheels (no ROCm on that index) - WHEEL=https://download.pytorch.org/whl/torch_stable.html - if [[ $CUDA_VERSION == cpu ]]; then -pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f $WHEEL - else - pip install torch==1.7.1+${CUDA_VERSION} \ - torchvision==0.8.2+${CUDA_VERSION} \ - torchaudio==0.7.2+${CUDA_VERSION} \ - -f $WHEEL - fi - # for PyG wheels use the official PyG index—with a real '+' in the URL - TORCH_VER=1.7.1 - WHEEL_URL="https://pytorch-geometric.com/whl/torch-${TORCH_VER}+${CUDA_VERSION}.html" - - # install ONLY the prebuilt binaries, never fall back to source - pip install torch-scatter -f "$WHEEL_URL" 
--only-binary torch-scatter - pip install torch-sparse -f "$WHEEL_URL" --only-binary torch-sparse - pip install torch-cluster -f "$WHEEL_URL" --only-binary torch-cluster - pip install torch-spline-conv -f "$WHEEL_URL" --only-binary torch-spline-conv - - # finally the meta‐package (this one can install from PyPI) - pip install torch-geometric - echo "✅ ts_gcn environment ready" +# 2) pip‐install exactly the CPU or CUDA wheels (no ROCm on that index) +PIP_RUN=("$COMMAND_PKG" run -n ts_gcn) +WHEEL=https://download.pytorch.org/whl/torch_stable.html +if [[ $CUDA_VERSION == cpu ]]; then + "${PIP_RUN[@]}" pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f $WHEEL +else + "${PIP_RUN[@]}" pip install torch==1.7.1+${CUDA_VERSION} \ + torchvision==0.8.2+${CUDA_VERSION} \ + torchaudio==0.7.2+${CUDA_VERSION} \ + -f $WHEEL +fi +# for PyG wheels use the official PyG index—with a real '+' in the URL +TORCH_VER=1.7.1 +WHEEL_URL="https://pytorch-geometric.com/whl/torch-${TORCH_VER}+${CUDA_VERSION}.html" + +# install ONLY the prebuilt binaries, never fall back to source +"${PIP_RUN[@]}" pip install torch-scatter -f "$WHEEL_URL" --only-binary torch-scatter +"${PIP_RUN[@]}" pip install torch-sparse -f "$WHEEL_URL" --only-binary torch-sparse +"${PIP_RUN[@]}" pip install torch-cluster -f "$WHEEL_URL" --only-binary torch-cluster +"${PIP_RUN[@]}" pip install torch-spline-conv -f "$WHEEL_URL" --only-binary torch-spline-conv + +# finally the meta‐package (this one can install from PyPI) +"${PIP_RUN[@]}" pip install torch-geometric +echo "✅ ts_gcn environment ready" # ── write hooks into conda envs if required ------------------------------- if [[ $MODE == conda ]]; then diff --git a/devtools/install_pyrdl.sh b/devtools/install_pyrdl.sh index 529d9d5dc3..edcb5ed9da 100644 --- a/devtools/install_pyrdl.sh +++ b/devtools/install_pyrdl.sh @@ -51,8 +51,8 @@ fi # Ensure CMake is installed in the environment if ! command -v cmake &> /dev/null; then - echo "Installing CMake..." 
- "$COMMAND_PKG" install -y cmake + echo "Installing CMake into arc_env..." + "$COMMAND_PKG" install -n arc_env -c conda-forge -y cmake fi # Clone and build RingDecomposerLib diff --git a/devtools/install_torchani.sh b/devtools/install_torchani.sh index 5410e88658..992031d014 100644 --- a/devtools/install_torchani.sh +++ b/devtools/install_torchani.sh @@ -2,9 +2,10 @@ set -eo pipefail # Enable tracing of each command, but tee it to a logfile +LOGFILE="tani_env_setup.log" exec 3>&1 4>&2 trap 'exec 2>&4 1>&3' EXIT -exec 1> >(tee .log) 2>&1 +exec 1> >(tee "$LOGFILE") 2>&1 set -x echo ">>> Starting TANI environment setup at $(date)" @@ -53,7 +54,7 @@ fi echo ">>> Creating conda env from $ENV_YAML (name=$ENV_NAME)" if ! $COMMAND_PKG env create -n "$ENV_NAME" -f "$ENV_YAML" -v; then echo "❌ Environment creation failed. Dumping last 200 lines of log:" - tail -n 200 tani_env_setup.log + tail -n 200 "$LOGFILE" echo "---- Disk usage at failure ----" df -h . exit 1 diff --git a/docs/source/TS_search.rst b/docs/source/TS_search.rst index 73513c9afb..db205dc61f 100644 --- a/docs/source/TS_search.rst +++ b/docs/source/TS_search.rst @@ -47,6 +47,9 @@ Outputs and validation """""""""""""""""""""" Validated TS results are reported in the project output (log files and generated artifacts), together with the supporting calculations (optimization, frequency, and IRC). +ARC does not require TS geometries to be isomorphic with a stored 2D adjacency list, since a TS does not have a +single strict graph representation. Instead, TS validation relies on TS-specific checks such as the imaginary +frequency, normal mode displacement analysis, IRC results, and energetic consistency. Reference """"""""" @@ -54,4 +57,90 @@ A detailed description of the methodology, design choices, and validation benchm L. Fahoum, A. Grinberg Dana, *“Automated Reaction Transition State Search for Neutral Hydrolysis Reactions”*, Digital Discovery, 2026. 
+CREST +^^^^^ + +CREST is an external conformational sampling tool used by ARC as a TS-search wrapper stage. +In ARC's current flow, CREST is applied to TS seeds generated by base TS search methods and uses +family-specific constraints from ARC. + +Current ARC family support for CREST: + +- ``H_Abstraction`` only (RMG family reference: + `H_Abstraction <https://rmg.mit.edu/database/kinetics/families/H_Abstraction/>`_). + +External references: + +- `CREST documentation <https://crest-lab.github.io/crest-docs/>`_ +- `CREST constrained sampling example <https://crest-lab.github.io/crest-docs/page/examples/>`_ + +Wrapper Extension Guide +""""""""""""""""""""""" + +Use this guide when extending CREST-based TS workflows in ARC (for example, adding hydrolysis support to CREST, +or allowing CREST to wrap a new TS seed source adapter). + +ARC uses a neutral wrapper hub API for TS seed generation and wrapper-specific constraints: + +- ``arc.job.adapters.ts.seed_hub.get_ts_seeds(...)`` +- ``arc.job.adapters.ts.seed_hub.get_wrapper_constraints(...)`` + +Current status +"""""""""""""" + +- ``CrestAdapter`` requests seeds using ``base_adapter='heuristics'``. +- ``CrestAdapter`` requests constraints using ``wrapper='crest'``. +- CREST constraints are currently implemented for ``H_Abstraction`` only. +- Hydrolysis seeds can be generated by heuristics, but CREST constraints for hydrolysis are not implemented yet. + +Seed schema contract +"""""""""""""""""""" + +``get_ts_seeds(...)`` returns a list of seed dictionaries with the following fields: + +- ``xyz``: Cartesian coordinates dictionary. +- ``family``: Reaction family associated with the seed. +- ``method``: Method label for provenance. +- ``source_adapter``: TS-search adapter id that generated the seed. +- ``metadata``: Optional adapter-specific metadata dictionary. + +Extension instructions: Add a new family to CREST +""""""""""""""""""""""""""""""""""""""""""""""""" + +1. Update ``get_ts_seeds(...)`` logic in ``arc/job/adapters/ts/seed_hub.py`` only if the seed generation path changes. +2. 
Add family-specific CREST constraints in ``_get_crest_constraints(...)`` (or family helper it calls) in + ``arc/job/adapters/ts/seed_hub.py``. +3. Add/update tests in ``arc/job/adapters/ts/heuristics_test.py`` (``TestHeuristicsHub``). +4. Update ``ts_adapters_by_rmg_family`` mapping if CREST should be enabled for that family. + +Extension instructions: Let CREST wrap a new TS seed adapter +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +1. Add a ``base_adapter`` branch in ``get_ts_seeds(...)``. +2. Ensure the returned seed objects satisfy the seed schema contract. +3. Reuse ``get_wrapper_constraints(wrapper='crest', ...)`` with those seeds. +4. Add tests for the new adapter branch and constraints compatibility. + +Minimal usage pattern +""""""""""""""""""""" + +.. code-block:: python + + from arc.job.adapters.ts.seed_hub import get_ts_seeds, get_wrapper_constraints + + seeds = get_ts_seeds( + reaction=rxn, + base_adapter='heuristics', + dihedral_increment=30, + ) + for seed in seeds: + crest_constraints = get_wrapper_constraints( + wrapper='crest', + reaction=rxn, + seed=seed, + ) + if crest_constraints is None: + continue + # run CREST with crest_constraints["A"], crest_constraints["H"], crest_constraints["B"] + .. include:: links.txt diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 091fa90b41..8393a1452d 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -481,14 +481,18 @@ __ Truhlar1_ Isomorphism checks ^^^^^^^^^^^^^^^^^^ -When a species is defined using a 2D graph (i.e., SMILES, AdjList, or InChI), an isomorphism check +For non-TS species defined using a 2D graph (i.e., SMILES, AdjList, or InChI), an isomorphism check is performed on the optimized geometry (all conformers and final optimization). 
-If the molecule perceived from the 3D coordinate is not isomorphic +If the molecule perceived from the 3D coordinates is not isomorphic with the input 2D graph, ARC will not spawn any additional jobs for the species, and will not use it further -(for thermo and/or rates calculations). However, sometimes the perception algorithm doesn't work as expected (e.g., -issues with charged species and triplets are known). To continue spawning jobs for all species in an ARC +(for thermo and/or rates calculations). However, sometimes the perception algorithm does not work as expected (e.g., +issues with charged species and triplets are known). To continue spawning jobs for all non-TS species in an ARC project, pass ``True`` to the ``allow_nonisomorphic_2d`` argument (it is ``False`` by default). +Transition states are handled differently. ARC does not enforce 2D-graph isomorphism for TS species, since TS +connectivity and bond orders are not uniquely defined. TS validation is instead based on TS-specific criteria such as +the imaginary frequency, normal mode displacement checks, IRC calculations, and energetic consistency. + .. _directory: diff --git a/functional/restart_test.py b/functional/restart_test.py index 35594910b8..5be2f476c2 100644 --- a/functional/restart_test.py +++ b/functional/restart_test.py @@ -12,7 +12,7 @@ from arc.molecule.molecule import Molecule -from arc.common import ARC_PATH, read_yaml_file +from arc.common import ARC_PATH, ARC_TESTING_PATH, read_yaml_file from arc.main import ARC @@ -42,7 +42,7 @@ def test_restart_thermo(self): Test restarting ARC through the ARC class in main.py via the input_dict argument of the API Rather than through ARC.py. Check that all files are in place and the log file content. 
""" - restart_dir = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '1_restart_thermo') + restart_dir = os.path.join(ARC_TESTING_PATH, 'restart', '1_restart_thermo') restart_path = os.path.join(restart_dir, 'restart.yml') project = _project_name('arc_project_for_testing_delete_after_usage_restart_thermo') project_directory = os.path.join(ARC_PATH, 'Projects', project) @@ -139,7 +139,7 @@ def test_restart_thermo(self): def test_restart_rate_1(self): """Test restarting ARC and attaining a reaction rate coefficient""" - restart_dir = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '2_restart_rate') + restart_dir = os.path.join(ARC_TESTING_PATH, 'restart', '2_restart_rate') restart_path = os.path.join(restart_dir, 'restart.yml') project = _project_name('arc_project_for_testing_delete_after_usage_restart_rate_1') project_directory = os.path.join(ARC_PATH, 'Projects', project) @@ -164,7 +164,7 @@ def test_restart_rate_2(self): """Test restarting ARC and attaining a reaction rate coefficient""" project = _project_name('arc_project_for_testing_delete_after_usage_restart_rate_2') project_directory = os.path.join(ARC_PATH, 'Projects', project) - base_path = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '5_TS1') + base_path = os.path.join(ARC_TESTING_PATH, 'restart', '5_TS1') restart_path = os.path.join(base_path, 'restart.yml') input_dict = read_yaml_file(path=restart_path, project_directory=project_directory) input_dict['output']['TS0']['paths']['freq'] = os.path.join(ARC_PATH, input_dict['output']['TS0']['paths']['freq']) @@ -189,7 +189,7 @@ def test_restart_rate_2(self): def test_restart_bde (self): """Test restarting ARC and attaining a BDE for anilino_radical.""" - restart_dir = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '3_restart_bde') + restart_dir = os.path.join(ARC_TESTING_PATH, 'restart', '3_restart_bde') restart_path = os.path.join(restart_dir, 'restart.yml') project = _project_name('test_restart_bde') project_directory = 
os.path.join(ARC_PATH, 'Projects', project) @@ -208,7 +208,7 @@ def test_restart_bde (self): def test_globalize_paths(self): """Test modifying a YAML file's contents to correct absolute file paths""" - project_directory = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '4_globalized_paths') + project_directory = os.path.join(ARC_TESTING_PATH, 'restart', '4_globalized_paths') restart_path = os.path.join(project_directory, 'restart_paths.yml') input_dict = read_yaml_file(path=restart_path, project_directory=project_directory) input_dict['project_directory'] = project_directory @@ -235,16 +235,16 @@ def tearDownClass(cls): project_directory = os.path.join(ARC_PATH, 'Projects', project) shutil.rmtree(project_directory, ignore_errors=True) - shutil.rmtree(os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '4_globalized_paths', + shutil.rmtree(os.path.join(ARC_TESTING_PATH, 'restart', '4_globalized_paths', 'log_and_restart_archive'), ignore_errors=True) for file_name in ['arc.log', 'restart_paths_globalized.yml']: - file_path = os.path.join(ARC_PATH, 'arc', 'testing', 'restart', '4_globalized_paths', file_name) + file_path = os.path.join(ARC_TESTING_PATH, 'restart', '4_globalized_paths', file_name) if os.path.isfile(file_path): os.remove(file_path) file_paths = [os.path.join(ARC_PATH, 'functional', 'nul'), os.path.join(ARC_PATH, 'functional', 'run.out')] project_names = ['1_restart_thermo', '2_restart_rate', '3_restart_bde', '5_TS1'] for project_name in project_names: - file_paths.append(os.path.join(ARC_PATH, 'arc', 'testing', 'restart', project_name, 'restart_globalized.yml')) + file_paths.append(os.path.join(ARC_TESTING_PATH, 'restart', project_name, 'restart_globalized.yml')) for file_path in file_paths: if os.path.isfile(file_path): os.remove(file_path)