From b3c47336b743bbc331625f700a00dc1ba8229678 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 11:43:14 -0700 Subject: [PATCH 01/83] refactor(core): Change potentials module visibility to crate-level --- crates/scream-core/src/core/forcefield/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/core/forcefield/mod.rs b/crates/scream-core/src/core/forcefield/mod.rs index ec060ce8..e1c35667 100644 --- a/crates/scream-core/src/core/forcefield/mod.rs +++ b/crates/scream-core/src/core/forcefield/mod.rs @@ -1,6 +1,6 @@ pub mod energy; pub mod parameterization; pub mod params; -pub mod potentials; +pub(crate) mod potentials; pub mod scoring; pub mod term; From ae0b5e1d0c725754086fe20516a1c646c2e296d2 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 11:43:29 -0700 Subject: [PATCH 02/83] refactor(core): Change sorting module visibility to crate-level --- crates/scream-core/src/core/io/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/core/io/mod.rs b/crates/scream-core/src/core/io/mod.rs index 784d1c6b..84e5f2b6 100644 --- a/crates/scream-core/src/core/io/mod.rs +++ b/crates/scream-core/src/core/io/mod.rs @@ -1,3 +1,3 @@ pub mod bgf; -pub mod sorting; +pub(crate) mod sorting; pub mod traits; From a105e1b6dce745b6fe1dc9debbd55d3c9e93215f Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 11:48:40 -0700 Subject: [PATCH 03/83] refactor(engine): Change OptimizationState visibility to crate-level --- crates/scream-core/src/engine/state.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/engine/state.rs b/crates/scream-core/src/engine/state.rs index f984c31a..427293d8 100644 --- a/crates/scream-core/src/engine/state.rs +++ b/crates/scream-core/src/engine/state.rs @@ -43,7 +43,7 @@ impl Ord for Solution { } #[derive(Debug, Clone)] -pub struct OptimizationState { +pub(crate) struct OptimizationState { pub 
working_state: SolutionState, pub current_optimization_score: f64, solutions: BinaryHeap, From abf47a4176e92f09f9d2b47fce46bf5155bc42d6 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 11:50:02 -0700 Subject: [PATCH 04/83] refactor(engine): Change ELCache visibility to crate-level --- crates/scream-core/src/engine/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/engine/mod.rs b/crates/scream-core/src/engine/mod.rs index cba14a93..d080f1ba 100644 --- a/crates/scream-core/src/engine/mod.rs +++ b/crates/scream-core/src/engine/mod.rs @@ -1,4 +1,4 @@ -pub mod cache; +pub(crate) mod cache; pub mod config; pub mod context; pub mod energy_grid; From 4544a21a45b773ed976e29e530ce92c6e63faa74 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 11:50:57 -0700 Subject: [PATCH 05/83] refactor(engine): Change context module visibility to crate-level --- crates/scream-core/src/engine/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/engine/mod.rs b/crates/scream-core/src/engine/mod.rs index d080f1ba..df769569 100644 --- a/crates/scream-core/src/engine/mod.rs +++ b/crates/scream-core/src/engine/mod.rs @@ -1,6 +1,6 @@ pub(crate) mod cache; pub mod config; -pub mod context; +pub(crate) mod context; pub mod energy_grid; pub mod error; pub mod placement; From d0d56095562069ef785c4a5f53e3d79774d1e511 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 11:51:34 -0700 Subject: [PATCH 06/83] refactor(engine): Change energy_grid module visibility to crate-level --- crates/scream-core/src/engine/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/engine/mod.rs b/crates/scream-core/src/engine/mod.rs index df769569..2da436ea 100644 --- a/crates/scream-core/src/engine/mod.rs +++ b/crates/scream-core/src/engine/mod.rs @@ -1,7 +1,7 @@ pub(crate) mod cache; pub mod config; pub(crate) mod context; -pub mod energy_grid; +pub(crate) 
mod energy_grid; pub mod error; pub mod placement; pub mod progress; From 60e17bed89c94f787ca71ccf553c7471383d98f5 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 11:51:42 -0700 Subject: [PATCH 07/83] refactor(engine): Change transaction module visibility to crate-level --- crates/scream-core/src/engine/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/engine/mod.rs b/crates/scream-core/src/engine/mod.rs index 2da436ea..01b62806 100644 --- a/crates/scream-core/src/engine/mod.rs +++ b/crates/scream-core/src/engine/mod.rs @@ -7,5 +7,5 @@ pub mod placement; pub mod progress; pub mod state; pub mod tasks; -pub mod transaction; +pub(crate) mod transaction; pub mod utils; From 5db7a2627f230a867a9d93d2f59d6b9990687c3a Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 11:51:49 -0700 Subject: [PATCH 08/83] refactor(engine): Change placement module visibility to crate-level --- crates/scream-core/src/engine/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/engine/mod.rs b/crates/scream-core/src/engine/mod.rs index 01b62806..bd41104d 100644 --- a/crates/scream-core/src/engine/mod.rs +++ b/crates/scream-core/src/engine/mod.rs @@ -3,7 +3,7 @@ pub mod config; pub(crate) mod context; pub(crate) mod energy_grid; pub mod error; -pub mod placement; +pub(crate) mod placement; pub mod progress; pub mod state; pub mod tasks; From 51b1e6660aedb039de4b5141d7c9e7e9e0e79076 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 11:52:20 -0700 Subject: [PATCH 09/83] refactor(engine): Change tasks module visibility to crate-level --- crates/scream-core/src/engine/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/engine/mod.rs b/crates/scream-core/src/engine/mod.rs index bd41104d..537ddb56 100644 --- a/crates/scream-core/src/engine/mod.rs +++ b/crates/scream-core/src/engine/mod.rs @@ -6,6 +6,6 @@ pub mod error; pub(crate) 
mod placement; pub mod progress; pub mod state; -pub mod tasks; +pub(crate) mod tasks; pub(crate) mod transaction; pub mod utils; From 617508f763785bffab404549cd6219ed6bb58061 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 11:52:29 -0700 Subject: [PATCH 10/83] refactor(engine): Change utils module visibility to crate-level --- crates/scream-core/src/engine/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/engine/mod.rs b/crates/scream-core/src/engine/mod.rs index 537ddb56..04dcd2ce 100644 --- a/crates/scream-core/src/engine/mod.rs +++ b/crates/scream-core/src/engine/mod.rs @@ -8,4 +8,4 @@ pub mod progress; pub mod state; pub(crate) mod tasks; pub(crate) mod transaction; -pub mod utils; +pub(crate) mod utils; From bb5a931ebe641bab8b64424f7ff72c8ed4568f27 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 12:28:40 -0700 Subject: [PATCH 11/83] refactor(core): Change energy module visibility to crate-level --- crates/scream-core/src/core/forcefield/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/core/forcefield/mod.rs b/crates/scream-core/src/core/forcefield/mod.rs index e1c35667..640f404c 100644 --- a/crates/scream-core/src/core/forcefield/mod.rs +++ b/crates/scream-core/src/core/forcefield/mod.rs @@ -1,4 +1,4 @@ -pub mod energy; +pub(crate) mod energy; pub mod parameterization; pub mod params; pub(crate) mod potentials; From e97d7ac014b8159242bcafa61302d8271dafa5db Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 15:37:22 -0700 Subject: [PATCH 12/83] docs(core): Add documentation and clarify atom role classifications --- crates/scream-core/src/core/models/atom.rs | 100 ++++++++++++++++----- 1 file changed, 80 insertions(+), 20 deletions(-) diff --git a/crates/scream-core/src/core/models/atom.rs b/crates/scream-core/src/core/models/atom.rs index be24496b..3b019a00 100644 --- a/crates/scream-core/src/core/models/atom.rs +++ 
b/crates/scream-core/src/core/models/atom.rs @@ -2,51 +2,94 @@ use super::ids::ResidueId; use nalgebra::Point3; use std::str::FromStr; +/// Represents the role or classification of an atom within a molecular structure. +/// +/// This enum categorizes atoms based on their functional role in the molecule, +/// which is useful for algorithms that need to distinguish between different +/// types of atoms (e.g., backbone vs. sidechain) for computational efficiency +/// or specific force field applications. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] pub enum AtomRole { - Backbone, // Backbone atom (e.g., C, N, O) - Sidechain, // Sidechain atom (e.g., CH3, OH) - Ligand, // Ligand atom (e.g., in a small molecule) - Water, // Water molecule atom (e.g., H2O) + /// Backbone atom, typically part of the main chain in proteins (e.g., C, N, O). + Backbone, + /// Sidechain atom, part of the side groups attached to the backbone. + Sidechain, + /// Ligand atom, associated with small molecules or ligands bound to the structure. + Ligand, + /// Water molecule atom, for solvent molecules in the system. + Water, + /// Unknown or unclassified atom role. #[default] - Other, // Unknown or unclassified atom + Other, } +/// Caches van der Waals parameters for efficient force field computations. +/// +/// This enum stores pre-computed parameters for different van der Waals potentials, +/// allowing for faster energy calculations by avoiding repeated lookups. +/// It supports common potential forms used in molecular simulations. #[derive(Debug, Clone, Copy, PartialEq)] pub enum CachedVdwParam { + /// Lennard-Jones potential parameters. LennardJones { + /// The van der Waals radius in Angstroms. radius: f64, + /// The well depth parameter (epsilon) in kcal/mol. well_depth: f64, }, + /// Buckingham potential parameters. Buckingham { + /// The van der Waals radius in Angstroms. radius: f64, + /// The well depth parameter in kcal/mol. 
well_depth: f64, + /// Scaling factor for the exponential term. scale: f64, }, + /// No cached parameters available. None, } +/// Represents an atom in a molecular structure with its properties and parameters. +/// +/// This struct encapsulates all the necessary information about an atom, +/// including its identity, physicochemical properties, and SCREAM-specific +/// parameters used in side-chain placement algorithms. It is designed for +/// high-performance computations in protein modeling. #[derive(Debug, Clone, PartialEq)] pub struct Atom { - // --- Identity & Topology --- - pub name: String, // Atom name (e.g., "CA", "N", "O") - pub residue_id: ResidueId, // ID of the parent residue + /// The name of the atom (e.g., "CA", "N", "O"). + pub name: String, + /// The ID of the parent residue this atom belongs to. + pub residue_id: ResidueId, + /// The role or classification of the atom in the molecular structure. pub role: AtomRole, - - // --- Physicochemical Properties --- - pub force_field_type: String, // Force field atom type (e.g., "C.3", "N.2") - pub partial_charge: f64, // Partial atomic charge - pub position: Point3, // 3D coordinates - - // --- SCREAM Algorithm Specific Parameters --- - pub delta: f64, // "Delta" value for the flat-bottom potential - - // --- Cached Force Field Parameters for Performance --- - pub vdw_param: CachedVdwParam, // Cached van der Waals parameters - pub hbond_type_id: i8, // Hydrogen bond type identifier (-1: None, 0: Donor Hydrogen, >0: Acceptor) + /// The force field atom type (e.g., "C.3", "N.2"). + pub force_field_type: String, + /// The partial atomic charge in elementary charge units. + pub partial_charge: f64, + /// The 3D coordinates of the atom in Angstroms. + pub position: Point3, + /// The "Delta" value for the flat-bottom potential in SCREAM algorithm. + pub delta: f64, + /// Cached van der Waals parameters for performance optimization. 
+ pub vdw_param: CachedVdwParam, + /// Hydrogen bond type identifier (-1: None, 0: Donor Hydrogen, >0: Acceptor). + pub hbond_type_id: i8, } impl Atom { + /// Creates a new `Atom` with default values for most fields. + /// + /// This constructor initializes an atom with the provided name, residue ID, + /// and position. Other fields are set to their default values and can be + /// modified afterward as needed. + /// + /// # Arguments + /// + /// * `name` - The name of the atom. + /// * `residue_id` - The ID of the residue this atom belongs to. + /// * `position` - The 3D coordinates of the atom. pub fn new(name: &str, residue_id: ResidueId, position: Point3) -> Self { Self { name: name.to_string(), @@ -65,6 +108,23 @@ impl Atom { impl FromStr for AtomRole { type Err = (); + /// Parses a string into an `AtomRole`. + /// + /// This implementation allows converting string representations of atom roles + /// into the corresponding enum variants. It is case-insensitive and supports + /// common variations (e.g., "side-chain" or "side_chain"). + /// + /// # Arguments + /// + /// * `s` - The string to parse into an `AtomRole`. + /// + /// # Return + /// + /// Returns the parsed `AtomRole` if successful. + /// + /// # Errors + /// + /// Returns `()` if the input string does not match any known atom role. 
fn from_str(s: &str) -> Result { match s.to_ascii_lowercase().as_str() { "backbone" => Ok(AtomRole::Backbone), From fd464a92fac4f2198bf88373c10d0eb15068ec1a Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 15:37:46 -0700 Subject: [PATCH 13/83] docs(core): Add documentation for ResidueType and Residue structs --- crates/scream-core/src/core/models/residue.rs | 248 +++++++++++++++--- 1 file changed, 215 insertions(+), 33 deletions(-) diff --git a/crates/scream-core/src/core/models/residue.rs b/crates/scream-core/src/core/models/residue.rs index 33be03ab..88aa9bb3 100644 --- a/crates/scream-core/src/core/models/residue.rs +++ b/crates/scream-core/src/core/models/residue.rs @@ -6,49 +6,87 @@ use std::fmt; use std::str::FromStr; use thiserror::Error; +/// Represents the type of an amino acid residue in a protein structure. +/// +/// This enum defines standard amino acid types, including their protonation states +/// for histidine variants, used in molecular modeling and simulations. +/// Each variant corresponds to a three-letter code commonly used in PDB files. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum ResidueType { - // --- Aliphatic, Nonpolar --- - Alanine, // Alanine (ALA) - Glycine, // Glycine (GLY) - Isoleucine, // Isoleucine (ILE) - Leucine, // Leucine (LEU) - Proline, // Proline (PRO) - Valine, // Valine (VAL) - - // --- Aromatic --- - Phenylalanine, // Phenylalanine (PHE) - Tryptophan, // Tryptophan (TRP) - Tyrosine, // Tyrosine (TYR) - - // --- Polar, Uncharged --- - Asparagine, // Asparagine (ASN) - Cysteine, // Cysteine (CYS) - Glutamine, // Glutamine (GLN) - Serine, // Serine (SER) - Threonine, // Threonine (THR) - Methionine, // Methionine (MET) - - // --- Positively Charged (Basic) --- - Arginine, // Arginine (ARG) - Lysine, // Lysine (LYS) - - // --- Negatively Charged (Acidic) --- - AsparticAcid, // Aspartic Acid (ASP) - GlutamicAcid, // Glutamic Acid (GLU) - - // --- Special Case: Histidine and its Variants --- - Histidine, // Histidine (HIS) - Typically assumed to be the Epsilon-protonated state (HSE) - HistidineEpsilon, // Epsilon-protonated Histidine (HSE) - An alias for `Histidine` - HistidineProtonated, // Doubly-protonated Histidine (HSP) - The positively charged variant + /// Alanine (ALA). + Alanine, + /// Glycine (GLY). + Glycine, + /// Isoleucine (ILE). + Isoleucine, + /// Leucine (LEU). + Leucine, + /// Proline (PRO). + Proline, + /// Valine (VAL). + Valine, + /// Phenylalanine (PHE). + Phenylalanine, + /// Tryptophan (TRP). + Tryptophan, + /// Tyrosine (TYR). + Tyrosine, + /// Asparagine (ASN). + Asparagine, + /// Cysteine (CYS). + Cysteine, + /// Glutamine (GLN). + Glutamine, + /// Serine (SER). + Serine, + /// Threonine (THR). + Threonine, + /// Methionine (MET). + Methionine, + /// Arginine (ARG). + Arginine, + /// Lysine (LYS). + Lysine, + /// Aspartic Acid (ASP). + AsparticAcid, + /// Glutamic Acid (GLU). + GlutamicAcid, + /// Histidine (HIS), typically epsilon-protonated. + Histidine, + /// Epsilon-protonated Histidine (HSE). 
+ HistidineEpsilon, + /// Doubly-protonated Histidine (HSP). + HistidineProtonated, } +/// Error type for failed parsing of residue type strings. +/// +/// This error is returned when attempting to parse an invalid or unsupported +/// three-letter residue code into a `ResidueType`. #[derive(Debug, Error, PartialEq, Eq)] #[error("Unsupported or unknown three-letter residue code: '{0}'")] pub struct ParseResidueTypeError(pub String); impl FromStr for ResidueType { type Err = ParseResidueTypeError; + + /// Parses a string into a `ResidueType`. + /// + /// This implementation converts a three-letter residue code (case-insensitive) + /// into the corresponding `ResidueType` variant. It supports standard amino acids + /// and histidine variants. + /// + /// # Arguments + /// + /// * `s` - The string to parse, expected to be a three-letter code. + /// + /// # Return + /// + /// Returns the parsed `ResidueType` if the code is recognized. + /// + /// # Errors + /// + /// Returns `ParseResidueTypeError` if the code is invalid or unsupported. fn from_str(s: &str) -> Result { ResidueType::from_str_optional(s) .ok_or_else(|| ParseResidueTypeError(s.trim().to_uppercase())) @@ -56,12 +94,28 @@ impl FromStr for ResidueType { } impl fmt::Display for ResidueType { + /// Formats the `ResidueType` as its three-letter code. + /// + /// This implementation allows `ResidueType` to be displayed as a string + /// using the standard three-letter amino acid codes. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.to_three_letter()) } } impl ResidueType { + /// Parses a three-letter residue code into a `ResidueType`. + /// + /// This is an internal helper function that performs case-insensitive matching + /// of standard three-letter codes to `ResidueType` variants. + /// + /// # Arguments + /// + /// * `s` - The string to parse. + /// + /// # Return + /// + /// Returns `Some(ResidueType)` if the code is recognized, otherwise `None`. 
fn parse_code(s: &str) -> Option { match s.trim().to_uppercase().as_str() { "ALA" => Some(ResidueType::Alanine), @@ -90,10 +144,30 @@ impl ResidueType { } } + /// Attempts to parse a string into a `ResidueType` without error handling. + /// + /// This method provides a fallible alternative to `FromStr::from_str`, + /// returning `None` for invalid codes instead of an error. + /// + /// # Arguments + /// + /// * `s` - The string to parse. + /// + /// # Return + /// + /// Returns `Some(ResidueType)` if parsing succeeds, otherwise `None`. pub fn from_str_optional(s: &str) -> Option { Self::parse_code(s) } + /// Converts the `ResidueType` to its standard three-letter code. + /// + /// This method returns the canonical three-letter abbreviation for the amino acid, + /// as used in PDB files and molecular databases. + /// + /// # Return + /// + /// A static string slice containing the three-letter code. pub fn to_three_letter(self) -> &'static str { match self { ResidueType::Alanine => "ALA", @@ -122,19 +196,43 @@ impl ResidueType { } } +/// Represents a residue in a molecular structure. +/// +/// This struct encapsulates the properties and atoms of a single residue, +/// providing efficient access to backbone and sidechain atoms through caching. +/// It is used in protein modeling and side-chain placement algorithms. #[derive(Debug, Clone, PartialEq)] pub struct Residue { + /// The sequential number of the residue in its chain. pub residue_number: isize, + /// The name of the residue. pub name: String, + /// The type of the residue, if known. pub residue_type: Option, + /// The ID of the chain this residue belongs to. pub chain_id: ChainId, + /// List of atom IDs belonging to this residue. atoms: Vec, + /// Mapping from atom names to lists of atom IDs for quick lookup. atom_name_map: HashMap>, + /// Cached list of sidechain atom IDs. sidechain_atoms_cache: Vec, + /// Cached list of backbone atom IDs. 
backbone_atoms_cache: Vec, } impl Residue { + /// Creates a new `Residue` with the specified properties. + /// + /// This constructor initializes a residue with empty atom lists and caches. + /// Atoms can be added later using `add_atom`. + /// + /// # Arguments + /// + /// * `residue_number` - The sequential number of the residue. + /// * `name` - The name of the residue. + /// * `residue_type` - The type of the residue, if known. + /// * `chain_id` - The ID of the chain this residue belongs to. pub(crate) fn new( residue_number: isize, name: &str, @@ -153,11 +251,24 @@ impl Residue { } } + /// Clears the cached lists of backbone and sidechain atoms. + /// + /// This method is called whenever the atom list changes to ensure + /// that cached data remains consistent. fn invalidate_caches(&mut self) { self.sidechain_atoms_cache.clear(); self.backbone_atoms_cache.clear(); } + /// Adds an atom to the residue. + /// + /// This method registers an atom with the given name and ID, updating + /// the internal lists and invalidating caches as necessary. + /// + /// # Arguments + /// + /// * `atom_name` - The name of the atom. + /// * `atom_id` - The ID of the atom to add. pub(crate) fn add_atom(&mut self, atom_name: &str, atom_id: AtomId) { self.atoms.push(atom_id); self.atom_name_map @@ -167,6 +278,15 @@ impl Residue { self.invalidate_caches(); } + /// Removes an atom from the residue. + /// + /// This method removes the specified atom by name and ID, cleaning up + /// the internal data structures and invalidating caches. + /// + /// # Arguments + /// + /// * `atom_name` - The name of the atom to remove. + /// * `atom_id_to_remove` - The ID of the atom to remove. pub(crate) fn remove_atom(&mut self, atom_name: &str, atom_id_to_remove: AtomId) { self.atoms.retain(|&id| id != atom_id_to_remove); @@ -179,10 +299,29 @@ impl Residue { self.invalidate_caches(); } + /// Returns a slice of all atom IDs in the residue. 
+ /// + /// This provides read-only access to the list of atoms belonging to the residue. + /// + /// # Return + /// + /// A slice containing all atom IDs. pub fn atoms(&self) -> &[AtomId] { &self.atoms } + /// Returns a slice of sidechain atom IDs, building the cache if necessary. + /// + /// This method lazily computes and caches the list of sidechain atoms + /// by querying the molecular system for atom roles. + /// + /// # Arguments + /// + /// * `system` - The molecular system containing the atoms. + /// + /// # Return + /// + /// A slice containing sidechain atom IDs. pub fn sidechain_atoms<'a>(&'a mut self, system: &'a MolecularSystem) -> &'a [AtomId] { if self.sidechain_atoms_cache.is_empty() && !self.atoms.is_empty() { self.build_caches(system); @@ -190,6 +329,18 @@ impl Residue { &self.sidechain_atoms_cache } + /// Returns a slice of backbone atom IDs, building the cache if necessary. + /// + /// This method lazily computes and caches the list of backbone atoms + /// by querying the molecular system for atom roles. + /// + /// # Arguments + /// + /// * `system` - The molecular system containing the atoms. + /// + /// # Return + /// + /// A slice containing backbone atom IDs. pub fn backbone_atoms<'a>(&'a mut self, system: &'a MolecularSystem) -> &'a [AtomId] { if self.backbone_atoms_cache.is_empty() && !self.atoms.is_empty() { self.build_caches(system); @@ -197,6 +348,14 @@ impl Residue { &self.backbone_atoms_cache } + /// Builds the caches for backbone and sidechain atoms. + /// + /// This method iterates through all atoms in the residue, determines their roles + /// using the molecular system, and populates the respective caches. + /// + /// # Arguments + /// + /// * `system` - The molecular system for querying atom roles. fn build_caches(&mut self, system: &MolecularSystem) { self.invalidate_caches(); for &atom_id in &self.atoms { @@ -210,10 +369,33 @@ impl Residue { } } + /// Retrieves atom IDs by atom name. 
+ /// + /// This method looks up atoms with the specified name in the residue. + /// + /// # Arguments + /// + /// * `name` - The name of the atom to search for. + /// + /// # Return + /// + /// Returns `Some` slice of atom IDs if found, otherwise `None`. pub fn get_atom_ids_by_name(&self, name: &str) -> Option<&[AtomId]> { self.atom_name_map.get(name).map(|v| v.as_slice()) } + /// Retrieves the first atom ID by atom name. + /// + /// This method returns the first atom with the specified name, useful + /// when there is expected to be only one atom of that name. + /// + /// # Arguments + /// + /// * `name` - The name of the atom to search for. + /// + /// # Return + /// + /// Returns `Some` atom ID if found, otherwise `None`. pub fn get_first_atom_id_by_name(&self, name: &str) -> Option { self.get_atom_ids_by_name(name) .and_then(|ids| ids.first().copied()) From f48de62b2e66345eabe9696617fde6199dffe7c7 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 15:38:39 -0700 Subject: [PATCH 14/83] docs(core): Add documentation for ChainType and Chain structs --- crates/scream-core/src/core/models/chain.rs | 68 ++++++++++++++++++++- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/crates/scream-core/src/core/models/chain.rs b/crates/scream-core/src/core/models/chain.rs index 855564d5..8f203c2e 100644 --- a/crates/scream-core/src/core/models/chain.rs +++ b/crates/scream-core/src/core/models/chain.rs @@ -3,22 +3,57 @@ use std::fmt; use std::str::FromStr; use thiserror::Error; +/// Represents the type of a molecular chain in a structure. +/// +/// This enum categorizes chains based on their molecular composition, +/// which is useful for algorithms that need to distinguish between +/// different types of molecules in simulations. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum ChainType { + /// Protein chain. Protein, + /// DNA chain. DNA, + /// RNA chain. RNA, + /// Ligand or small molecule chain. Ligand, + /// Water molecule chain. 
Water, + /// Other or unspecified chain type. Other, } +/// Error type for failed parsing of chain type strings. +/// +/// This error is returned when attempting to parse an invalid +/// string into a `ChainType`. Note that this error is currently +/// not used since unknown strings default to `ChainType::Other`. #[derive(Debug, Error)] #[error("Invalid chain type string")] pub struct ParseChainTypeError; impl FromStr for ChainType { type Err = ParseChainTypeError; + + /// Parses a string into a `ChainType`. + /// + /// This implementation converts string representations of chain types + /// into the corresponding enum variants. It is case-insensitive and + /// defaults to `ChainType::Other` for unknown strings. + /// + /// # Arguments + /// + /// * `s` - The string to parse. + /// + /// # Return + /// + /// Returns the parsed `ChainType`. + /// + /// # Errors + /// + /// This method does not currently return errors; unknown strings + /// are mapped to `ChainType::Other`. fn from_str(s: &str) -> Result { match s.to_lowercase().as_str() { "protein" => Ok(ChainType::Protein), @@ -32,6 +67,10 @@ impl FromStr for ChainType { } impl fmt::Display for ChainType { + /// Formats the `ChainType` as a human-readable string. + /// + /// This implementation allows `ChainType` to be displayed as a string + /// using capitalized names for each variant. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -48,14 +87,29 @@ impl fmt::Display for ChainType { } } +/// Represents a molecular chain in a structure. +/// +/// This struct encapsulates the properties and residues of a single chain, +/// providing access to its constituent residues in order. #[derive(Debug, Clone, PartialEq, Eq)] pub struct Chain { - pub id: char, // Chain identifier (e.g., 'A', 'B') - pub chain_type: ChainType, // Type of the chain - pub(crate) residues: Vec, // Ordered list of residue IDs belonging to this chain + /// The single-character identifier of the chain (e.g., 'A', 'B'). 
+ pub id: char, + /// The type of the chain. + pub chain_type: ChainType, + /// Ordered list of residue IDs belonging to this chain. + pub(crate) residues: Vec, } impl Chain { + /// Creates a new `Chain` with the specified ID and type. + /// + /// This constructor initializes a chain with an empty list of residues. + /// + /// # Arguments + /// + /// * `id` - The single-character identifier for the chain. + /// * `chain_type` - The type of the chain. pub(crate) fn new(id: char, chain_type: ChainType) -> Self { Self { id, @@ -64,6 +118,14 @@ impl Chain { } } + /// Returns a slice of all residue IDs in the chain. + /// + /// This provides read-only access to the ordered list of residues + /// belonging to the chain. + /// + /// # Return + /// + /// A slice containing all residue IDs in order. pub fn residues(&self) -> &[ResidueId] { &self.residues } From 3e46f4f9001005f4dff99cf431fb7dee1f228ca3 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 15:39:34 -0700 Subject: [PATCH 15/83] docs(core): Add documentation for BondOrder and Bond structs --- .../scream-core/src/core/models/topology.rs | 73 ++++++++++++++++++- 1 file changed, 70 insertions(+), 3 deletions(-) diff --git a/crates/scream-core/src/core/models/topology.rs b/crates/scream-core/src/core/models/topology.rs index 40ac5f21..3c168c26 100644 --- a/crates/scream-core/src/core/models/topology.rs +++ b/crates/scream-core/src/core/models/topology.rs @@ -3,27 +3,60 @@ use std::fmt; use std::str::FromStr; use thiserror::Error; +/// Represents the order of a chemical bond between atoms. +/// +/// This enum defines the possible bond orders used in molecular topology, +/// supporting common bond types in organic and biological molecules. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[repr(u8)] pub enum BondOrder { + /// Single bond. Single, + /// Double bond. Double, + /// Triple bond. Triple, + /// Aromatic bond. Aromatic, } impl Default for BondOrder { + /// Returns the default bond order. 
+ /// + /// The default is `BondOrder::Single`, representing the most common bond type. fn default() -> Self { BondOrder::Single } } +/// Error type for failed parsing of bond order strings. +/// +/// This error is returned when attempting to parse an invalid +/// string into a `BondOrder`. #[derive(Debug, Error)] #[error("Invalid bond order string")] pub struct ParseBondOrderError; impl FromStr for BondOrder { type Err = ParseBondOrderError; + + /// Parses a string into a `BondOrder`. + /// + /// This implementation supports multiple string representations + /// for each bond order, including numeric and textual forms. + /// It is case-insensitive. + /// + /// # Arguments + /// + /// * `s` - The string to parse. + /// + /// # Return + /// + /// Returns the parsed `BondOrder` if successful. + /// + /// # Errors + /// + /// Returns `ParseBondOrderError` if the string is invalid. fn from_str(s: &str) -> Result { match s.to_lowercase().as_str() { "1" | "s" | "single" => Ok(Self::Single), @@ -36,6 +69,10 @@ impl FromStr for BondOrder { } impl fmt::Display for BondOrder { + /// Formats the `BondOrder` as a human-readable string. + /// + /// This implementation allows `BondOrder` to be displayed as a string + /// using capitalized names for each variant. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -50,15 +87,33 @@ impl fmt::Display for BondOrder { } } +/// Represents a chemical bond between two atoms. +/// +/// This struct defines a bond with its constituent atoms and order, +/// providing the basic topology information for molecular structures. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct Bond { - pub atom1_id: AtomId, // ID of the first atom - pub atom2_id: AtomId, // ID of the second atom - pub order: BondOrder, // Bond order (e.g., single, double, etc.) + /// ID of the first atom in the bond. + pub atom1_id: AtomId, + /// ID of the second atom in the bond. + pub atom2_id: AtomId, + /// The order of the bond. 
+ pub order: BondOrder, } impl Bond { + /// Creates a new `Bond` between two atoms. + /// + /// This constructor normalizes the atom IDs to ensure `atom1_id <= atom2_id`, + /// providing a canonical representation for bond equality and hashing. + /// + /// # Arguments + /// + /// * `atom1_id` - ID of one atom in the bond. + /// * `atom2_id` - ID of the other atom in the bond. + /// * `order` - The order of the bond. pub fn new(atom1_id: AtomId, atom2_id: AtomId, order: BondOrder) -> Self { + // Normalize atom IDs to ensure consistent ordering for equality and hashing. let (atom1_id, atom2_id) = if atom1_id <= atom2_id { (atom1_id, atom2_id) } else { @@ -71,6 +126,18 @@ impl Bond { } } + /// Checks if the bond contains a specific atom. + /// + /// This method determines whether the given atom ID is one of the + /// two atoms participating in the bond. + /// + /// # Arguments + /// + /// * `atom_id` - The atom ID to check. + /// + /// # Return + /// + /// Returns `true` if the atom is part of the bond, otherwise `false`. pub fn contains(&self, atom_id: AtomId) -> bool { self.atom1_id == atom_id || self.atom2_id == atom_id } From 0bf1c6437008221bac1eba912837c3cf0d6e8718 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 15:43:05 -0700 Subject: [PATCH 16/83] docs(core): Add documentation for MolecularSystem struct and its methods --- crates/scream-core/src/core/models/system.rs | 258 ++++++++++++++++++- 1 file changed, 253 insertions(+), 5 deletions(-) diff --git a/crates/scream-core/src/core/models/system.rs b/crates/scream-core/src/core/models/system.rs index 04b1aa79..a0d4ea12 100644 --- a/crates/scream-core/src/core/models/system.rs +++ b/crates/scream-core/src/core/models/system.rs @@ -6,67 +6,184 @@ use super::topology::{Bond, BondOrder}; use slotmap::{SecondaryMap, SlotMap}; use std::collections::HashMap; +/// Represents a complete molecular system with atoms, residues, chains, and bonds. 
+/// +/// This struct serves as the central data structure for molecular modeling, +/// providing efficient storage and access to all molecular components. +/// It maintains internal caches and lookup maps for performance optimization. #[derive(Debug, Clone, Default)] pub struct MolecularSystem { - // --- Primary Data Stores (Source of Truth) --- + /// Primary storage for atoms using a slot map for efficient ID management. atoms: SlotMap, + /// Primary storage for residues using a slot map for efficient ID management. residues: SlotMap, + /// Primary storage for chains using a slot map for efficient ID management. chains: SlotMap, + /// List of all bonds in the system. bonds: Vec, - - // --- Lookup Maps for Fast Access --- + /// Lookup map for finding residues by chain ID and residue number. residue_id_map: HashMap<(ChainId, isize), ResidueId>, + /// Lookup map for finding chains by their single-character identifier. chain_id_map: HashMap, - - // --- Adjacency information (cache for performance) --- + /// Cached adjacency list for bond connectivity, indexed by atom ID. bond_adjacency: SecondaryMap>, } impl MolecularSystem { + /// Creates a new, empty molecular system. + /// + /// This constructor initializes all internal data structures + /// and is ready for adding chains, residues, and atoms. pub fn new() -> Self { Self::default() } + /// Retrieves an immutable reference to an atom by its ID. + /// + /// # Arguments + /// + /// * `id` - The atom ID to look up. + /// + /// # Return + /// + /// Returns `Some(&Atom)` if the atom exists, otherwise `None`. pub fn atom(&self, id: AtomId) -> Option<&Atom> { self.atoms.get(id) } + + /// Retrieves a mutable reference to an atom by its ID. + /// + /// # Arguments + /// + /// * `id` - The atom ID to look up. + /// + /// # Return + /// + /// Returns `Some(&mut Atom)` if the atom exists, otherwise `None`. 
pub fn atom_mut(&mut self, id: AtomId) -> Option<&mut Atom> { self.atoms.get_mut(id) } + + /// Returns an iterator over all atoms in the system. + /// + /// # Return + /// + /// An iterator yielding `(AtomId, &Atom)` pairs. pub fn atoms_iter(&self) -> impl Iterator { self.atoms.iter() } + + /// Returns a mutable iterator over all atoms in the system. + /// + /// # Return + /// + /// An iterator yielding `(AtomId, &mut Atom)` pairs. pub fn atoms_iter_mut(&mut self) -> impl Iterator { self.atoms.iter_mut() } + /// Retrieves an immutable reference to a residue by its ID. + /// + /// # Arguments + /// + /// * `id` - The residue ID to look up. + /// + /// # Return + /// + /// Returns `Some(&Residue)` if the residue exists, otherwise `None`. pub fn residue(&self, id: ResidueId) -> Option<&Residue> { self.residues.get(id) } + + /// Retrieves a mutable reference to a residue by its ID. + /// + /// # Arguments + /// + /// * `id` - The residue ID to look up. + /// + /// # Return + /// + /// Returns `Some(&mut Residue)` if the residue exists, otherwise `None`. pub fn residue_mut(&mut self, id: ResidueId) -> Option<&mut Residue> { self.residues.get_mut(id) } + + /// Returns an iterator over all residues in the system. + /// + /// # Return + /// + /// An iterator yielding `(ResidueId, &Residue)` pairs. pub fn residues_iter(&self) -> impl Iterator { self.residues.iter() } + /// Retrieves an immutable reference to a chain by its ID. + /// + /// # Arguments + /// + /// * `id` - The chain ID to look up. + /// + /// # Return + /// + /// Returns `Some(&Chain)` if the chain exists, otherwise `None`. pub fn chain(&self, id: ChainId) -> Option<&Chain> { self.chains.get(id) } + + /// Retrieves a mutable reference to a chain by its ID. + /// + /// # Arguments + /// + /// * `id` - The chain ID to look up. + /// + /// # Return + /// + /// Returns `Some(&mut Chain)` if the chain exists, otherwise `None`. 
pub fn chain_mut(&mut self, id: ChainId) -> Option<&mut Chain> { self.chains.get_mut(id) } + + /// Returns an iterator over all chains in the system. + /// + /// # Return + /// + /// An iterator yielding `(ChainId, &Chain)` pairs. pub fn chains_iter(&self) -> impl Iterator { self.chains.iter() } + /// Returns a slice of all bonds in the system. + /// + /// # Return + /// + /// A slice containing all bonds. pub fn bonds(&self) -> &[Bond] { &self.bonds } + /// Finds a chain ID by its single-character identifier. + /// + /// # Arguments + /// + /// * `id` - The character identifier of the chain. + /// + /// # Return + /// + /// Returns `Some(ChainId)` if the chain exists, otherwise `None`. pub fn find_chain_by_id(&self, id: char) -> Option { self.chain_id_map.get(&id).copied() } + + /// Finds a residue ID by its chain ID and residue number. + /// + /// # Arguments + /// + /// * `chain_id` - The ID of the chain containing the residue. + /// * `residue_number` - The sequential number of the residue. + /// + /// # Return + /// + /// Returns `Some(ResidueId)` if the residue exists, otherwise `None`. pub fn find_residue_by_id( &self, chain_id: ChainId, @@ -77,6 +194,19 @@ impl MolecularSystem { .copied() } + /// Adds a new chain to the system or returns the existing one. + /// + /// This method is idempotent; if a chain with the given ID already exists, + /// it returns the existing chain ID without creating a duplicate. + /// + /// # Arguments + /// + /// * `id` - The single-character identifier for the chain. + /// * `chain_type` - The type of the chain. + /// + /// # Return + /// + /// The ID of the chain (new or existing). pub fn add_chain(&mut self, id: char, chain_type: ChainType) -> ChainId { *self.chain_id_map.entry(id).or_insert_with(|| { let chain = Chain::new(id, chain_type); @@ -84,6 +214,21 @@ impl MolecularSystem { }) } + /// Adds a new residue to the system or returns the existing one. 
+ /// + /// This method is idempotent; if a residue with the given chain ID and + /// residue number already exists, it returns the existing residue ID. + /// + /// # Arguments + /// + /// * `chain_id` - The ID of the chain to add the residue to. + /// * `residue_number` - The sequential number of the residue. + /// * `name` - The name of the residue. + /// * `residue_type` - The type of the residue, if known. + /// + /// # Return + /// + /// Returns `Some(ResidueId)` if successful, otherwise `None` (e.g., if chain doesn't exist). pub fn add_residue( &mut self, chain_id: ChainId, @@ -106,6 +251,19 @@ impl MolecularSystem { Some(residue_id) } + /// Adds an atom to a specific residue. + /// + /// This method inserts the atom into the system and registers it with the given residue. + /// It also initializes the bond adjacency list for the new atom. + /// + /// # Arguments + /// + /// * `residue_id` - The ID of the residue to add the atom to. + /// * `atom` - The atom to add. + /// + /// # Return + /// + /// Returns `Some(AtomId)` if successful, otherwise `None` (e.g., if residue doesn't exist). pub fn add_atom_to_residue(&mut self, residue_id: ResidueId, atom: Atom) -> Option { if !self.residues.contains_key(residue_id) { return None; @@ -122,6 +280,21 @@ impl MolecularSystem { Some(atom_id) } + /// Adds a bond between two atoms. + /// + /// This method creates a bond between the specified atoms and updates + /// the adjacency cache. It is idempotent; adding an existing bond + /// succeeds without creating duplicates. + /// + /// # Arguments + /// + /// * `atom1_id` - ID of the first atom. + /// * `atom2_id` - ID of the second atom. + /// * `order` - The order of the bond. + /// + /// # Return + /// + /// Returns `Some(())` if successful, otherwise `None` (e.g., if atoms don't exist). 
pub fn add_bond(&mut self, atom1_id: AtomId, atom2_id: AtomId, order: BondOrder) -> Option<()> { if !self.atoms.contains_key(atom1_id) || !self.atoms.contains_key(atom2_id) { return None; @@ -140,6 +313,19 @@ impl MolecularSystem { Some(()) } + /// Removes an atom from the system. + /// + /// This method removes the atom and all associated data, including + /// bonds and adjacency information. It updates the parent residue + /// and cleans up all references. + /// + /// # Arguments + /// + /// * `atom_id` - The ID of the atom to remove. + /// + /// # Return + /// + /// Returns `Some(Atom)` if the atom existed and was removed, otherwise `None`. pub fn remove_atom(&mut self, atom_id: AtomId) -> Option { let atom = self.atoms.remove(atom_id)?; @@ -166,6 +352,18 @@ impl MolecularSystem { Some(atom) } + /// Removes a residue from the system. + /// + /// This method removes the residue and all its atoms, updating + /// the parent chain and cleaning up all references and bonds. + /// + /// # Arguments + /// + /// * `residue_id` - The ID of the residue to remove. + /// + /// # Return + /// + /// Returns `Some(Residue)` if the residue existed and was removed, otherwise `None`. pub fn remove_residue(&mut self, residue_id: ResidueId) -> Option { let residue = self.residues.get(residue_id)?.clone(); // Clone to avoid borrow checker issues @@ -188,28 +386,73 @@ impl MolecularSystem { self.residues.remove(residue_id) } + /// Retrieves the bonded neighbors of an atom. + /// + /// This method returns the list of atoms directly bonded to the given atom, + /// using the cached adjacency information. + /// + /// # Arguments + /// + /// * `atom_id` - The ID of the atom to query. + /// + /// # Return + /// + /// Returns `Some(&[AtomId])` if the atom exists, otherwise `None`. pub fn get_bonded_neighbors(&self, atom_id: AtomId) -> Option<&[AtomId]> { self.bond_adjacency.get(atom_id).map(|v| v.as_slice()) } + /// Returns an iterator over atoms with a specific role. 
+ /// + /// # Arguments + /// + /// * `role` - The atom role to filter by. + /// + /// # Return + /// + /// An iterator yielding `(AtomId, &Atom)` pairs for matching atoms. pub fn atoms_by_role(&self, role: AtomRole) -> impl Iterator { self.atoms.iter().filter(move |(_, atom)| atom.role == role) } + /// Returns a vector of atom IDs with a specific role. + /// + /// # Arguments + /// + /// * `role` - The atom role to filter by. + /// + /// # Return + /// + /// A vector containing the IDs of matching atoms. pub fn atom_ids_by_role(&self, role: AtomRole) -> Vec { self.atoms_by_role(role).map(|(id, _)| id).collect() } + /// Returns an iterator over protein atoms (backbone and sidechain). + /// + /// # Return + /// + /// An iterator yielding `(AtomId, &Atom)` pairs for protein atoms. pub fn protein_atoms(&self) -> impl Iterator { self.atoms .iter() .filter(|(_, atom)| matches!(atom.role, AtomRole::Backbone | AtomRole::Sidechain)) } + /// Returns a vector of protein atom IDs. + /// + /// # Return + /// + /// A vector containing the IDs of protein atoms. pub fn protein_atom_ids(&self) -> Vec { self.protein_atoms().map(|(id, _)| id).collect() } + /// Returns an iterator over background atoms (ligands, water, other). + /// + /// # Return + /// + /// An iterator yielding `(AtomId, &Atom)` pairs for background atoms. pub fn background_atoms(&self) -> impl Iterator { self.atoms.iter().filter(|(_, atom)| { matches!( @@ -219,6 +462,11 @@ impl MolecularSystem { }) } + /// Returns a vector of background atom IDs. + /// + /// # Return + /// + /// A vector containing the IDs of background atoms. 
pub fn background_atom_ids(&self) -> Vec { self.background_atoms().map(|(id, _)| id).collect() } From 30d5028936e685641ee8df5747d20ecced66b686 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 15:43:45 -0700 Subject: [PATCH 17/83] docs(core): Add module documentation for core data models --- crates/scream-core/src/core/models/mod.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/crates/scream-core/src/core/models/mod.rs b/crates/scream-core/src/core/models/mod.rs index 2f1a8ff1..da24b753 100644 --- a/crates/scream-core/src/core/models/mod.rs +++ b/crates/scream-core/src/core/models/mod.rs @@ -1,3 +1,10 @@ +//! Core data models for molecular structures. +//! +//! This module contains the fundamental data structures used to represent +//! molecular systems in SCREAM++, including atoms, residues, chains, and +//! their topological relationships. These models provide the foundation +//! for molecular modeling and simulation algorithms. + pub mod atom; pub mod chain; pub mod ids; From 0357bbf46fa54fc334ffd042cfecc5b9152326d6 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 15:49:13 -0700 Subject: [PATCH 18/83] docs(core): Add documentation for MolecularFile trait with additional methods for file I/O --- crates/scream-core/src/core/io/traits.rs | 92 ++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/crates/scream-core/src/core/io/traits.rs b/crates/scream-core/src/core/io/traits.rs index dccce5f9..b6697f4b 100644 --- a/crates/scream-core/src/core/io/traits.rs +++ b/crates/scream-core/src/core/io/traits.rs @@ -4,25 +4,88 @@ use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Write}; use std::path::Path; +/// Defines the interface for reading and writing molecular file formats. +/// +/// This trait provides a common API for molecular file I/O operations, +/// supporting both reading from and writing to various file formats. +/// Implementors handle format-specific parsing and serialization. 
pub trait MolecularFile { + /// The type of metadata associated with the file format. type Metadata; + + /// The error type for I/O operations. type Error: Error + From; + /// Reads a molecular system from a buffered reader. + /// + /// # Arguments + /// + /// * `reader` - The buffered reader to read from. + /// + /// # Return + /// + /// Returns the parsed molecular system and associated metadata. + /// + /// # Errors + /// + /// Returns an error if parsing fails or I/O operations encounter issues. fn read_from( reader: &mut impl BufRead, ) -> Result<(MolecularSystem, Self::Metadata), Self::Error>; + /// Writes a molecular system and metadata to a writer. + /// + /// # Arguments + /// + /// * `system` - The molecular system to write. + /// * `metadata` - The metadata to include in the output. + /// * `writer` - The writer to output to. + /// + /// # Return + /// + /// Returns `Ok(())` on success. + /// + /// # Errors + /// + /// Returns an error if writing fails or I/O operations encounter issues. fn write_to( system: &MolecularSystem, metadata: &Self::Metadata, writer: &mut impl Write, ) -> Result<(), Self::Error>; + /// Writes a molecular system to a writer without metadata. + /// + /// # Arguments + /// + /// * `system` - The molecular system to write. + /// * `writer` - The writer to output to. + /// + /// # Return + /// + /// Returns `Ok(())` on success. + /// + /// # Errors + /// + /// Returns an error if writing fails or I/O operations encounter issues. fn write_system_to( system: &MolecularSystem, writer: &mut impl Write, ) -> Result<(), Self::Error>; + /// Reads a molecular system from a file path. + /// + /// # Arguments + /// + /// * `path` - The path to the file to read. + /// + /// # Return + /// + /// Returns the parsed molecular system and associated metadata. + /// + /// # Errors + /// + /// Returns an error if the file cannot be opened or parsing fails. 
fn read_from_path>( path: P, ) -> Result<(MolecularSystem, Self::Metadata), Self::Error> { @@ -31,6 +94,21 @@ pub trait MolecularFile { Self::read_from(&mut reader) } + /// Writes a molecular system and metadata to a file path. + /// + /// # Arguments + /// + /// * `system` - The molecular system to write. + /// * `metadata` - The metadata to include in the output. + /// * `path` - The path to the file to write. + /// + /// # Return + /// + /// Returns `Ok(())` on success. + /// + /// # Errors + /// + /// Returns an error if the file cannot be created or writing fails. fn write_to_path>( system: &MolecularSystem, metadata: &Self::Metadata, @@ -41,6 +119,20 @@ pub trait MolecularFile { Self::write_to(system, metadata, &mut writer) } + /// Writes a molecular system to a file path without metadata. + /// + /// # Arguments + /// + /// * `system` - The molecular system to write. + /// * `path` - The path to the file to write. + /// + /// # Return + /// + /// Returns `Ok(())` on success. + /// + /// # Errors + /// + /// Returns an error if the file cannot be created or writing fails. fn write_system_to_path>( system: &MolecularSystem, path: P, From f006c67b2d64cb7621e17301fbd25523e7ad91a0 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 15:54:01 -0700 Subject: [PATCH 19/83] docs(core): Add documentation for BgfMetadata, BgfError, and BgfFile methods --- crates/scream-core/src/core/io/bgf.rs | 176 ++++++++++++++++++++++++-- 1 file changed, 162 insertions(+), 14 deletions(-) diff --git a/crates/scream-core/src/core/io/bgf.rs b/crates/scream-core/src/core/io/bgf.rs index 1545cee0..f42671a2 100644 --- a/crates/scream-core/src/core/io/bgf.rs +++ b/crates/scream-core/src/core/io/bgf.rs @@ -11,20 +11,42 @@ use std::collections::{BTreeSet, HashMap}; use std::io::{self, BufRead, Write}; use thiserror::Error; +/// Metadata associated with a BGF file, containing header information and other non-structural data. 
+/// +/// This struct holds information that is not part of the molecular system's topology but is +/// preserved from the original BGF file, such as header lines that may contain remarks or +/// force field information. #[derive(Debug, Default, Clone)] pub struct BgfMetadata { - pub header_lines: Vec, // Lines before the ATOM records + /// Lines from the BGF file header that appear before the atom records. + /// + /// These typically include remarks, force field specifications, and other metadata + /// that should be preserved when writing the file back out. + pub header_lines: Vec, } +/// A handler for reading and writing BGF (Biograf) molecular structure files. +/// +/// This struct implements the `MolecularFile` trait to provide support for the BGF format, +/// which is commonly used in molecular simulations and modeling software. It handles +/// parsing atom records, connectivity information, and metadata while building a +/// `MolecularSystem` representation. #[derive(Debug, Default)] pub struct BgfFile; +/// Errors that can occur during BGF file processing. +/// +/// This enum covers I/O errors, parsing failures, and logical inconsistencies +/// encountered when reading or writing BGF files. #[derive(Debug, Error)] pub enum BgfError { + /// An I/O error occurred during file operations. #[error("I/O error: {0}")] Io(#[from] io::Error), + /// A parsing error occurred on a specific line. #[error("Parse error on line {line_num}: {message}")] Parse { line_num: usize, message: String }, + /// A logical error occurred during file processing. #[error("Logic error during file processing: {0}")] Logic(String), } @@ -33,6 +55,26 @@ impl MolecularFile for BgfFile { type Metadata = BgfMetadata; type Error = BgfError; + /// Reads a molecular system from a BGF file. + /// + /// This method parses the BGF file format, extracting atoms, residues, chains, + /// and connectivity information to construct a `MolecularSystem`. 
It handles + /// both ATOM and HETATM records, automatically categorizing chains based on + /// residue types, and processes CONECT records to establish bonds. + /// + /// # Arguments + /// + /// * `reader` - A buffered reader providing the BGF file content. + /// + /// # Return + /// + /// Returns a tuple containing the constructed `MolecularSystem` and any + /// associated metadata extracted from the file. + /// + /// # Errors + /// + /// Returns a `BgfError` if parsing fails due to malformed input, I/O issues, + /// or logical inconsistencies in the file structure. fn read_from( reader: &mut impl BufRead, ) -> Result<(MolecularSystem, Self::Metadata), Self::Error> { @@ -154,6 +196,26 @@ impl MolecularFile for BgfFile { Ok((system, metadata)) } + /// Writes a molecular system to a BGF file with associated metadata. + /// + /// This method serializes the `MolecularSystem` into the BGF format, including + /// atom records, connectivity information, and any provided metadata. Atoms are + /// sorted into canonical order for consistent output, and serial numbers are + /// reassigned sequentially. + /// + /// # Arguments + /// + /// * `system` - The molecular system to write. + /// * `metadata` - Metadata to include in the output file. + /// * `writer` - A writer to output the BGF file content. + /// + /// # Return + /// + /// Returns `Ok(())` on successful writing. + /// + /// # Errors + /// + /// Returns a `BgfError` if writing fails due to I/O issues. fn write_to( system: &MolecularSystem, metadata: &Self::Metadata, @@ -233,6 +295,24 @@ impl MolecularFile for BgfFile { Ok(()) } + /// Writes a molecular system to a BGF file with default metadata. + /// + /// This is a convenience method that writes the system using minimal default + /// metadata, including standard BGF header lines for BIOGRF version and + /// force field information. + /// + /// # Arguments + /// + /// * `system` - The molecular system to write. 
+ /// * `writer` - A writer to output the BGF file content. + /// + /// # Return + /// + /// Returns `Ok(())` on successful writing. + /// + /// # Errors + /// + /// Returns a `BgfError` if writing fails due to I/O issues. fn write_system_to( system: &MolecularSystem, writer: &mut impl Write, @@ -245,19 +325,24 @@ impl MolecularFile for BgfFile { } } -#[derive(Debug, Clone)] -struct ParsedAtomInfo { - record_type: String, - serial: usize, - name: String, - res_name: String, - chain_char: char, - res_seq: isize, - pos: Point3, - charge: f64, - ff_type: String, -} - +/// Parses a single ATOM or HETATM line from a BGF file. +/// +/// This function extracts all relevant information from a BGF atom record, +/// including position, charge, and force field type, returning a structured +/// representation suitable for building the molecular system. +/// +/// # Arguments +/// +/// * `line` - The BGF atom record line to parse. +/// +/// # Return +/// +/// Returns a `ParsedAtomInfo` struct containing the parsed data. +/// +/// # Errors +/// +/// Returns a `String` describing the parsing error if the line is malformed +/// or contains invalid data. fn parse_atom_line(line: &str) -> Result { let get_slice = |start, end| { line.get(start..end) @@ -304,6 +389,24 @@ fn parse_atom_line(line: &str) -> Result { }) } +/// Parses a CONECT line from a BGF file to extract connectivity information. +/// +/// This function processes CONECT records, which specify bonds between atoms +/// using their serial numbers, returning the base atom and its connected atoms. +/// +/// # Arguments +/// +/// * `line` - The BGF CONECT record line to parse. +/// +/// # Return +/// +/// Returns a tuple containing the base atom serial and a vector of connected +/// atom serials. +/// +/// # Errors +/// +/// Returns a `String` describing the parsing error if the line is malformed +/// or contains invalid serial numbers. 
fn parse_conect_line(line: &str) -> Result<(usize, Vec), String> { let mut parts = line.split_whitespace().skip(1); let base_serial_str = parts.next().ok_or("Missing base serial in CONECT")?; @@ -319,6 +422,25 @@ fn parse_conect_line(line: &str) -> Result<(usize, Vec), String> { Ok((base_serial, connected_serials)) } +/// Formats an atom record line for output in BGF format. +/// +/// This function constructs a properly formatted BGF atom line from the +/// provided atom, residue, and chain information, including connectivity +/// and charge data. +/// +/// # Arguments +/// +/// * `record_type` - The record type ("ATOM" or "HETATM"). +/// * `serial` - The sequential serial number for the atom. +/// * `atom` - The atom to format. +/// * `residue` - The residue containing the atom. +/// * `chain` - The chain containing the residue. +/// * `atoms_connected` - Number of atoms connected to this atom. +/// * `lone_pairs` - Number of lone pairs (currently placeholder). +/// +/// # Return +/// +/// Returns the formatted BGF atom line as a string. fn format_atom_line( record_type: &str, serial: usize, @@ -346,6 +468,32 @@ fn format_atom_line( ) } +/// Internal structure holding parsed information from a BGF atom line. +/// +/// This struct is used internally during parsing to temporarily store +/// atom information before creating the actual `Atom` instance. +#[derive(Debug, Clone)] +struct ParsedAtomInfo { + /// The record type ("ATOM" or "HETATM"). + record_type: String, + /// The original serial number from the file. + serial: usize, + /// The atom name. + name: String, + /// The residue name. + res_name: String, + /// The chain identifier character. + chain_char: char, + /// The residue sequence number. + res_seq: isize, + /// The 3D position of the atom. + pos: Point3, + /// The partial charge of the atom. + charge: f64, + /// The force field type of the atom. 
+ ff_type: String, +} + #[cfg(test)] mod tests { use super::*; From 135fcd35a5582b8c12863642f14575d1e870e1ab Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 15:54:27 -0700 Subject: [PATCH 20/83] docs(core): Add documentation for CanonicalAtom struct and sorting functions --- .../scream-core/src/core/io/sorting/sorter.rs | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/crates/scream-core/src/core/io/sorting/sorter.rs b/crates/scream-core/src/core/io/sorting/sorter.rs index a571bf84..4eb86dae 100644 --- a/crates/scream-core/src/core/io/sorting/sorter.rs +++ b/crates/scream-core/src/core/io/sorting/sorter.rs @@ -4,14 +4,45 @@ use crate::core::models::ids::AtomId; use crate::core::models::system::MolecularSystem; use std::cmp::Ordering; +/// Represents an atom in the context of canonical sorting for molecular file output. +/// +/// This struct wraps an atom with additional metadata needed for sorting, +/// including the chain character and residue number for hierarchical ordering. +/// It is used internally during the sorting process to maintain references +/// to the original atom data while providing sorting keys. #[derive(Debug)] pub struct CanonicalAtom<'a> { + /// The unique identifier of the atom in the molecular system. pub id: AtomId, + /// A reference to the original atom data. pub source: &'a Atom, + /// The character identifier of the chain containing this atom. pub chain_char: char, + /// The sequence number of the residue containing this atom. pub residue_number: isize, } +/// Sorts all atoms in a molecular system into a canonical order for consistent file output. +/// +/// This function implements a multi-level sorting algorithm that organizes atoms +/// hierarchically by chain, then by residue sequence, and finally by atom name +/// according to standard molecular naming conventions. 
This ensures that molecular +/// files are written in a predictable, canonical order regardless of the internal +/// storage order of atoms in the system. +/// +/// The sorting uses predefined atom name aliases and weights to handle common +/// naming variations and establish a biologically meaningful atom order within +/// each residue. +/// +/// # Arguments +/// +/// * `system` - The molecular system containing the atoms to sort. +/// +/// # Return +/// +/// Returns a vector of `CanonicalAtom` instances sorted in canonical order. +/// Each element contains the atom ID, a reference to the original atom, and +/// the sorting metadata (chain character and residue number). pub fn sort_system_atoms(system: &MolecularSystem) -> Vec { let mut atoms_to_sort: Vec = system .atoms_iter() @@ -44,6 +75,23 @@ pub fn sort_system_atoms(system: &MolecularSystem) -> Vec { atoms_to_sort } +/// Compares two atom names according to canonical molecular naming conventions. +/// +/// This function implements the third level of sorting in the canonical atom order. +/// It handles atom name aliases (e.g., "HCA" as "HA") and uses predefined weights +/// to establish a biologically meaningful order for atoms within a residue. +/// Unknown atom names are sorted after known ones, and among unknowns, +/// alphabetical order is used. +/// +/// # Arguments +/// +/// * `name_a` - The first atom name to compare. +/// * `name_b` - The second atom name to compare. +/// +/// # Return +/// +/// Returns an `Ordering` indicating whether `name_a` should come before, +/// after, or at the same position as `name_b` in canonical order. 
fn compare_atom_names(name_a: &str, name_b: &str) -> Ordering { let trimmed_a = name_a.trim(); let trimmed_b = name_b.trim(); From 628c0ff89bcd1fd6141f90434858085d078bd17e Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 15:56:13 -0700 Subject: [PATCH 21/83] docs(core): Add module documentation for sorting utilities --- crates/scream-core/src/core/io/sorting/mod.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/scream-core/src/core/io/sorting/mod.rs b/crates/scream-core/src/core/io/sorting/mod.rs index c41eeb20..55916ed7 100644 --- a/crates/scream-core/src/core/io/sorting/mod.rs +++ b/crates/scream-core/src/core/io/sorting/mod.rs @@ -1,2 +1,10 @@ +//! Utilities for sorting molecular system components into canonical order. +//! +//! This module contains functionality for organizing atoms, residues, and other +//! molecular components into a consistent, biologically meaningful order for +//! file output and processing. The sorting ensures reproducible results across +//! different molecular file formats and maintains the expected ordering used +//! in computational chemistry and structural biology applications. + pub mod rules; pub mod sorter; From 6e51f5e580882385aeb8e1db82ca672f5eeec042 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 15:57:59 -0700 Subject: [PATCH 22/83] docs(core): Add module documentation for molecular file I/O functionality --- crates/scream-core/src/core/io/mod.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/crates/scream-core/src/core/io/mod.rs b/crates/scream-core/src/core/io/mod.rs index 84e5f2b6..9cd19545 100644 --- a/crates/scream-core/src/core/io/mod.rs +++ b/crates/scream-core/src/core/io/mod.rs @@ -1,3 +1,10 @@ +//! Provides input/output functionality for molecular file formats. +//! +//! This module contains implementations for reading and writing various molecular +//! structure file formats commonly used in computational chemistry and structural +//! biology. 
It provides a unified trait-based interface for file I/O operations +//! and includes utilities for canonical ordering of molecular components. + pub mod bgf; pub(crate) mod sorting; pub mod traits; From 1ebdd86a0c86ebf9d04ac9bd821deef767563606 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:01:40 -0700 Subject: [PATCH 23/83] docs(core): Add documentation for potential energy functions in forcefield module --- .../src/core/forcefield/potentials.rs | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/crates/scream-core/src/core/forcefield/potentials.rs b/crates/scream-core/src/core/forcefield/potentials.rs index 167c3bbc..5c1d2019 100644 --- a/crates/scream-core/src/core/forcefield/potentials.rs +++ b/crates/scream-core/src/core/forcefield/potentials.rs @@ -1,5 +1,24 @@ +/// The Coulomb constant used in electrostatic potential calculations. +/// +/// This constant represents the value of 1/(4πε₀) in units of kcal·Å/(mol·e²), +/// which is the standard unit system used in molecular mechanics force fields. const COULOMB_CONSTANT: f64 = 332.0637; // In kcal·Å/(mol·e²) +/// Calculates the Lennard-Jones 12-6 potential energy between two atoms. +/// +/// This function implements the classic Lennard-Jones potential, which models +/// van der Waals interactions with a repulsive r⁻¹² term and an attractive r⁻⁶ term. +/// The potential reaches its minimum at the specified `r_min` distance. +/// +/// # Arguments +/// +/// * `dist` - The distance between the two atoms. +/// * `r_min` - The distance at which the potential reaches its minimum. +/// * `well_depth` - The depth of the potential well (positive magnitude; the energy at `r_min` is `-well_depth`). +/// +/// # Return +/// +/// Returns the potential energy. Positive values indicate repulsion, negative values indicate attraction.
#[inline] pub fn lennard_jones_12_6(dist: f64, r_min: f64, well_depth: f64) -> f64 { if dist < 1e-6 { @@ -11,6 +30,22 @@ pub fn lennard_jones_12_6(dist: f64, r_min: f64, well_depth: f64) -> f64 { well_depth * (rho12 - 2.0 * rho6) } +/// Calculates the Buckingham exponential-6 potential energy between two atoms. +/// +/// This function implements a modified Buckingham potential that switches to +/// pure Lennard-Jones repulsion at short distances to prevent numerical instability. +/// The potential combines exponential repulsion with r⁻⁶ dispersion attraction. +/// +/// # Arguments +/// +/// * `dist` - The distance between the two atoms. +/// * `r_min` - The distance parameter for the potential. +/// * `well_depth` - The depth of the potential well. +/// * `gamma` - The exponential decay parameter. +/// +/// # Return +/// +/// Returns the potential energy. The function switches to r⁻¹² repulsion below 60% of `r_min`. #[inline] pub fn buckingham_exp_6(dist: f64, r_min: f64, well_depth: f64, gamma: f64) -> f64 { const POTENTIAL_SWITCHING_FACTOR: f64 = 0.6; @@ -31,6 +66,21 @@ pub fn buckingham_exp_6(dist: f64, r_min: f64, well_depth: f64, gamma: f64) -> f well_depth * (6.0 / (gamma - 6.0) * (gamma * (1.0 - rho)).exp() - factor * rho.powi(-6)) } +/// Calculates the Coulomb electrostatic potential energy between two charged atoms. +/// +/// This function computes the electrostatic interaction energy using Coulomb's law +/// with the appropriate constant for molecular mechanics units. +/// +/// # Arguments +/// +/// * `dist` - The distance between the two atoms. +/// * `q1` - The charge of the first atom. +/// * `q2` - The charge of the second atom. +/// * `dielectric` - The dielectric constant of the medium. +/// +/// # Return +/// +/// Returns the electrostatic potential energy. The sign depends on the charges. 
#[inline] pub fn coulomb(dist: f64, q1: f64, q2: f64, dielectric: f64) -> f64 { if dist < 1e-6 { @@ -39,6 +89,21 @@ pub fn coulomb(dist: f64, q1: f64, q2: f64, dielectric: f64) -> f64 { COULOMB_CONSTANT * q1 * q2 / (dielectric * dist) } +/// Calculates the Dreiding hydrogen bond 12-10 potential energy. +/// +/// This function implements the specialized hydrogen bond potential used in the +/// Dreiding force field, which combines r⁻¹² and r⁻¹⁰ terms for hydrogen bonding +/// interactions between donor and acceptor atoms. +/// +/// # Arguments +/// +/// * `dist_ad` - The distance between the donor and acceptor atoms. +/// * `r_hb` - The equilibrium hydrogen bond distance. +/// * `d_hb` - The hydrogen bond well depth. +/// +/// # Return +/// +/// Returns the hydrogen bond potential energy. #[inline] pub fn dreiding_hbond_12_10(dist_ad: f64, r_hb: f64, d_hb: f64) -> f64 { if dist_ad < 1e-6 { @@ -50,6 +115,22 @@ pub fn dreiding_hbond_12_10(dist_ad: f64, r_hb: f64, d_hb: f64) -> f64 { d_hb * (5.0 * rho12 - 6.0 * rho10) } +/// Applies a flat-bottom modification to a van der Waals potential function. +/// +/// This function modifies the behavior of a potential in the repulsive region by +/// creating a flat energy well around the ideal distance, which can improve +/// numerical stability in molecular dynamics simulations. +/// +/// # Arguments +/// +/// * `dist` - The actual distance between atoms. +/// * `ideal_dist` - The ideal equilibrium distance. +/// * `delta` - The width of the flat-bottom region. +/// * `potential_fn` - The base potential function to modify. +/// +/// # Return +/// +/// Returns the modified potential energy. #[inline] pub fn apply_flat_bottom_vdw(dist: f64, ideal_dist: f64, delta: f64, potential_fn: F) -> f64 where @@ -76,6 +157,22 @@ where } } +/// Applies a flat-bottom modification to a hydrogen bond potential function. 
+/// +/// This function creates a flat energy region around the ideal hydrogen bond +/// distance to stabilize the interaction while maintaining the correct asymptotic +/// behavior at long and short ranges. +/// +/// # Arguments +/// +/// * `dist` - The actual distance between atoms. +/// * `ideal_dist` - The ideal hydrogen bond distance. +/// * `delta` - The width of the flat-bottom region. +/// * `potential_fn` - The base potential function to modify. +/// +/// # Return +/// +/// Returns the modified potential energy. #[inline] pub fn apply_flat_bottom_hbond(dist: f64, ideal_dist: f64, delta: f64, potential_fn: F) -> f64 where From f88c16f4d5219c34a729102610cd338eddcb36d3 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:02:58 -0700 Subject: [PATCH 24/83] docs(core): Add documentation for EnergyTerm struct and its methods --- .../scream-core/src/core/forcefield/term.rs | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/crates/scream-core/src/core/forcefield/term.rs b/crates/scream-core/src/core/forcefield/term.rs index 7c64ed11..b7310fd4 100644 --- a/crates/scream-core/src/core/forcefield/term.rs +++ b/crates/scream-core/src/core/forcefield/term.rs @@ -1,13 +1,32 @@ use std::ops::{Add, AddAssign}; +/// Represents the energy contributions from different molecular interaction types. +/// +/// This struct encapsulates the separate energy components calculated during +/// molecular mechanics simulations, allowing for detailed analysis of different +/// force field contributions to the total system energy. #[derive(Debug, Clone, Copy, PartialEq, Default)] pub struct EnergyTerm { + /// The van der Waals interaction energy contribution. pub vdw: f64, + /// The electrostatic (Coulomb) interaction energy contribution. pub coulomb: f64, + /// The hydrogen bond interaction energy contribution. pub hbond: f64, } impl EnergyTerm { + /// Creates a new `EnergyTerm` with the specified energy components. 
+ /// + /// # Arguments + /// + /// * `vdw` - The van der Waals energy contribution. + /// * `coulomb` - The electrostatic energy contribution. + /// * `hbond` - The hydrogen bond energy contribution. + /// + /// # Return + /// + /// Returns a new `EnergyTerm` instance with the provided values. pub fn new(vdw: f64, coulomb: f64, hbond: f64) -> Self { Self { vdw, @@ -16,6 +35,11 @@ impl EnergyTerm { } } + /// Calculates the total energy as the sum of all components. + /// + /// # Return + /// + /// Returns the sum of van der Waals, Coulomb, and hydrogen bond energies. #[inline] pub fn total(&self) -> f64 { self.vdw + self.coulomb + self.hbond @@ -25,6 +49,18 @@ impl EnergyTerm { impl Add for EnergyTerm { type Output = Self; + /// Adds two `EnergyTerm` instances component-wise. + /// + /// This implementation allows combining energy contributions from different + /// sources or time steps in molecular dynamics simulations. + /// + /// # Arguments + /// + /// * `rhs` - The right-hand side `EnergyTerm` to add. + /// + /// # Return + /// + /// Returns a new `EnergyTerm` with summed components. fn add(self, rhs: Self) -> Self::Output { Self { vdw: self.vdw + rhs.vdw, @@ -35,6 +71,14 @@ impl Add for EnergyTerm { } impl AddAssign for EnergyTerm { + /// Adds another `EnergyTerm` to this one in place. + /// + /// This allows accumulating energy contributions efficiently without + /// creating intermediate instances. + /// + /// # Arguments + /// + /// * `rhs` - The right-hand side `EnergyTerm` to add. 
fn add_assign(&mut self, rhs: Self) { self.vdw += rhs.vdw; self.coulomb += rhs.coulomb; From 59c9bf2a731695ac84fbbb3ece964762a82e93b0 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:05:57 -0700 Subject: [PATCH 25/83] docs(core): Add documentation for force field parameters and loading methods --- .../scream-core/src/core/forcefield/params.rs | 147 +++++++++++++++++- 1 file changed, 146 insertions(+), 1 deletion(-) diff --git a/crates/scream-core/src/core/forcefield/params.rs b/crates/scream-core/src/core/forcefield/params.rs index b2d1b7d5..e8630242 100644 --- a/crates/scream-core/src/core/forcefield/params.rs +++ b/crates/scream-core/src/core/forcefield/params.rs @@ -4,56 +4,114 @@ use std::collections::{HashMap, HashSet}; use std::path::Path; use thiserror::Error; +/// Represents the parameters for van der Waals interactions. +/// +/// This enum supports two common potential functions used in molecular mechanics: +/// Buckingham exponential-6 and Lennard-Jones 12-6 potentials. #[derive(Debug, Deserialize, Clone, PartialEq)] #[serde(untagged)] pub enum VdwParam { + /// Parameters for the Buckingham exponential-6 potential. + /// + /// This potential combines exponential repulsion with r⁻⁶ dispersion attraction, + /// providing better long-range behavior than Lennard-Jones for some systems. Buckingham { + /// The van der Waals radius parameter. radius: f64, + /// The potential well depth. well_depth: f64, + /// The exponential decay parameter. scale: f64, }, + /// Parameters for the Lennard-Jones 12-6 potential. + /// + /// This is the classic potential with r⁻¹² repulsion and r⁻⁶ attraction terms. LennardJones { + /// The van der Waals radius parameter. radius: f64, + /// The potential well depth. well_depth: f64, }, } +/// Parameters for hydrogen bond interactions. +/// +/// Hydrogen bonds are directional interactions between donor and acceptor atoms, +/// modeled with a specialized potential function. 
#[derive(Debug, Deserialize, Clone, PartialEq)] pub struct HBondParam { + /// The equilibrium distance for the hydrogen bond. pub equilibrium_distance: f64, + /// The depth of the hydrogen bond potential well. pub well_depth: f64, } +/// Global parameters that apply to the entire force field. +/// +/// These parameters define the overall behavior of the force field calculations, +/// such as the dielectric constant and the type of potential function used. #[derive(Debug, Deserialize, Clone, PartialEq, Default)] pub struct GlobalParams { + /// The dielectric constant used in electrostatic calculations. pub dielectric_constant: f64, + /// The name of the potential function to use for van der Waals interactions. pub potential_function: String, } +/// Parameters for non-bonded interactions in the force field. +/// +/// This struct contains all the parameters needed for calculating van der Waals, +/// electrostatic, and hydrogen bond interactions between atoms. #[derive(Debug, Deserialize, Clone, PartialEq)] pub struct NonBondedParams { + /// Global parameters for the force field. pub globals: GlobalParams, + /// Van der Waals parameters for each atom type. pub vdw: HashMap, + /// Hydrogen bond parameters for donor-acceptor pairs. pub hbond: HashMap, + /// Set of atom types that can act as hydrogen bond donors. + /// + /// This field is populated automatically during loading based on the + /// hydrogen bond parameter keys. #[serde(skip)] pub hbond_donors: HashSet, + /// Set of atom types that can act as hydrogen bond acceptors. + /// + /// This field is populated automatically during loading based on the + /// hydrogen bond parameter keys. #[serde(skip)] pub hbond_acceptors: HashSet, } +/// Parameters for atom-specific corrections (deltas) in the force field. +/// +/// These parameters allow for fine-tuning of atomic properties on a per-residue, +/// per-atom basis, typically used for improving agreement with experimental data. 
#[derive(Debug, Deserialize, Clone)] pub struct DeltaParam { + /// The residue type for which this delta applies. pub residue_type: String, + /// The atom name within the residue. pub atom_name: String, + /// The mean correction value. pub mu: f64, + /// The standard deviation of the correction. pub sigma: f64, } +/// Weights for different energy components in force field calculations. +/// +/// This struct allows scaling the contribution of different interaction types +/// to the total energy, which can be useful for optimization or analysis. #[derive(Debug, Clone, Copy, PartialEq)] pub struct EnergyComponentWeights { + /// Weight for van der Waals interactions. pub vdw: f64, + /// Weight for electrostatic interactions. pub coulomb: f64, + /// Weight for hydrogen bond interactions. pub hbond: f64, } @@ -67,41 +125,95 @@ impl Default for EnergyComponentWeights { } } +/// A rule for applying energy weights based on atom roles. +/// +/// This struct defines how energy components should be weighted when calculating +/// interactions between atoms of specific roles (e.g., backbone vs. sidechain). #[derive(Debug, Clone, PartialEq)] pub struct WeightRule { + /// The pair of atom roles to which this rule applies. pub groups: [AtomRole; 2], + /// The weights to apply for interactions between these roles. pub weights: EnergyComponentWeights, } +/// Configuration for energy component weights across different atom role pairs. +/// +/// This struct contains a collection of rules that define how energy components +/// should be weighted based on the roles of the interacting atoms. #[derive(Debug, Clone, PartialEq, Default)] pub struct EnergyWeights { + /// The list of weighting rules to apply. pub rules: Vec, } +/// The complete force field parameter set. +/// +/// This struct encapsulates all parameters needed for molecular mechanics +/// calculations, including non-bonded interactions, atomic corrections, +/// and energy weighting rules. 
#[derive(Debug, Clone)] pub struct Forcefield { + /// Parameters for non-bonded interactions. pub non_bonded: NonBondedParams, + /// Atomic correction parameters indexed by (residue_type, atom_name). pub deltas: HashMap<(String, String), DeltaParam>, + /// Energy weights for different atom role pairs. pub weight_map: HashMap<(AtomRole, AtomRole), EnergyComponentWeights>, } +/// Errors that can occur during parameter loading. +/// +/// This enum covers various failure modes when loading force field parameters +/// from configuration files. #[derive(Debug, Error)] pub enum ParamLoadError { + /// An I/O error occurred while reading a file. #[error("File I/O error for '{path}': {source}")] Io { + /// The path to the file that caused the error. path: String, + /// The underlying I/O error. source: std::io::Error, }, + /// A CSV parsing error occurred. #[error("CSV parsing error for '{path}': {source}")] - Csv { path: String, source: csv::Error }, + Csv { + /// The path to the CSV file that caused the error. + path: String, + /// The underlying CSV parsing error. + source: csv::Error, + }, + /// A TOML parsing error occurred. #[error("TOML parsing error for '{path}': {source}")] Toml { + /// The path to the TOML file that caused the error. path: String, + /// The underlying TOML parsing error. source: toml::de::Error, }, } impl Forcefield { + /// Loads a complete force field from configuration files. + /// + /// This method reads non-bonded parameters from a TOML file, delta parameters + /// from a CSV file, and applies energy weighting rules to create a complete + /// force field parameter set. + /// + /// # Arguments + /// + /// * `non_bonded_path` - Path to the TOML file containing non-bonded parameters. + /// * `delta_path` - Path to the CSV file containing delta parameters. + /// * `energy_weights_config` - Configuration for energy component weights. + /// + /// # Return + /// + /// Returns a `Forcefield` instance with all parameters loaded. 
+ /// + /// # Errors + /// + /// Returns a `ParamLoadError` if any of the files cannot be read or parsed. pub fn load( non_bonded_path: &Path, delta_path: &Path, @@ -114,6 +226,7 @@ impl Forcefield { for rule in &energy_weights_config.rules { let a = rule.groups[0]; let b = rule.groups[1]; + // Ensure consistent ordering for the map key let key = if a <= b { (a, b) } else { (b, a) }; weight_map.insert(key, rule.weights); } @@ -125,6 +238,22 @@ impl Forcefield { }) } + /// Loads non-bonded parameters from a TOML file. + /// + /// This method parses the TOML file and automatically populates the + /// hydrogen bond donor and acceptor sets based on the parameter keys. + /// + /// # Arguments + /// + /// * `path` - Path to the TOML file containing non-bonded parameters. + /// + /// # Return + /// + /// Returns the parsed `NonBondedParams`. + /// + /// # Errors + /// + /// Returns a `ParamLoadError` if the file cannot be read or parsed. fn load_non_bonded(path: &Path) -> Result { let content = std::fs::read_to_string(path).map_err(|e| ParamLoadError::Io { path: path.to_string_lossy().to_string(), @@ -154,6 +283,22 @@ impl Forcefield { Ok(params) } + /// Loads delta parameters from a CSV file. + /// + /// This method reads atomic correction parameters from a CSV file and + /// indexes them by residue type and atom name for efficient lookup. + /// + /// # Arguments + /// + /// * `path` - Path to the CSV file containing delta parameters. + /// + /// # Return + /// + /// Returns a map of delta parameters indexed by (residue_type, atom_name). + /// + /// # Errors + /// + /// Returns a `ParamLoadError` if the file cannot be read or parsed. 
fn load_delta_csv( path: &Path, ) -> Result, ParamLoadError> { From 92758ec48731280c351aacb4e4a1dd2d039e1fc0 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:06:58 -0700 Subject: [PATCH 26/83] docs(core): Add documentation for energy calculation methods and error handling --- .../scream-core/src/core/forcefield/energy.rs | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/crates/scream-core/src/core/forcefield/energy.rs b/crates/scream-core/src/core/forcefield/energy.rs index b18a85ce..33caef92 100644 --- a/crates/scream-core/src/core/forcefield/energy.rs +++ b/crates/scream-core/src/core/forcefield/energy.rs @@ -4,20 +4,59 @@ use crate::core::models::ids::ResidueId; use std::f64::consts::PI; use thiserror::Error; +/// Represents errors that can occur during energy calculations in the force field. +/// +/// This enum encapsulates various error conditions that may arise when computing +/// interaction energies between atoms, providing detailed context for debugging +/// and error handling in molecular simulations. #[derive(Debug, Error)] pub enum EnergyCalculationError { + /// Indicates that an atom lacks the necessary parameters for van der Waals energy calculation. + /// + /// This error occurs when attempting to compute VDW interactions for atoms that have + /// not been parameterized with appropriate force field parameters (e.g., Lennard-Jones + /// or Buckingham parameters). #[error( "Atom '{atom_name}' in residue {residue_id:?} is not parameterized for VDW calculation" )] UnparameterizedAtom { + /// The name of the atom that is missing parameters. atom_name: String, + /// The ID of the residue containing the unparameterized atom. residue_id: ResidueId, }, } +/// Provides static methods for calculating various types of molecular interaction energies. 
+/// +/// This struct serves as a utility for computing energy contributions from different +/// force field terms, including van der Waals, electrostatic, and hydrogen bonding +/// interactions. All methods are designed to work with the SCREAM++ molecular +/// data structures and follow standard force field conventions. pub struct EnergyCalculator; impl EnergyCalculator { + /// Calculates the van der Waals interaction energy between two atoms. + /// + /// This method computes the VDW energy using either Lennard-Jones 12-6 or Buckingham + /// exponential-6 potentials, depending on the parameterization of the atoms. It combines + /// parameters from both atoms using standard mixing rules and applies a flat-bottom + /// modification based on the atoms' delta values to handle close contacts. + /// + /// # Arguments + /// + /// * `atom1` - The first atom participating in the interaction + /// * `atom2` - The second atom participating in the interaction + /// + /// # Return + /// + /// Returns the calculated VDW energy in kcal/mol. Positive values indicate repulsive + /// interactions, negative values indicate attractive interactions. + /// + /// # Errors + /// + /// Returns `EnergyCalculationError::UnparameterizedAtom` if either atom lacks + /// VDW parameters. pub fn calculate_vdw(atom1: &Atom, atom2: &Atom) -> Result { let extract_params = |atom: &Atom| match atom.vdw_param { CachedVdwParam::LennardJones { radius, well_depth } => Ok((radius, well_depth, 0.0)), @@ -37,6 +76,7 @@ impl EnergyCalculator { let dist = (atom1.position - atom2.position).norm(); + // Combine delta values to determine the flat-bottom cutoff distance let total_delta = (atom1.delta.powi(2) + atom2.delta.powi(2)).sqrt(); let r_min_combined = (r_min1 + r_min2) / 2.0; let well_depth_combined = (well_depth1 * well_depth2).sqrt(); @@ -54,11 +94,46 @@ impl EnergyCalculator { Ok(energy) } + /// Calculates the electrostatic Coulomb interaction energy between two atoms. 
+ /// + /// This method computes the electrostatic energy using Coulomb's law with a constant, + /// distance-independent dielectric. The calculation assumes point charges and does not + /// include any cutoff or screening effects beyond the dielectric. + /// + /// # Arguments + /// + /// * `atom1` - The first atom participating in the interaction + /// * `atom2` - The second atom participating in the interaction + /// * `dielectric` - The dielectric constant of the medium (dimensionless) + /// + /// # Return + /// + /// Returns the calculated Coulomb energy in kcal/mol. The sign depends on the + /// charges: positive for like charges, negative for opposite charges. pub fn calculate_coulomb(atom1: &Atom, atom2: &Atom, dielectric: f64) -> f64 { let dist = (atom1.position - atom2.position).norm(); potentials::coulomb(dist, atom1.partial_charge, atom2.partial_charge, dielectric) } + /// Calculates the hydrogen bonding interaction energy between donor, hydrogen, and acceptor atoms. + /// + /// This method implements the Dreiding hydrogen bond potential with a 12-10 distance + /// dependence and an angular term based on the donor-hydrogen-acceptor angle. The + /// potential is zero for angles ≤ 90° and includes a flat-bottom modification for + /// close contacts based on the atoms' delta values. + /// + /// # Arguments + /// + /// * `donor` - The donor atom in the hydrogen bond + /// * `hydrogen` - The hydrogen atom attached to the donor + /// * `acceptor` - The acceptor atom in the hydrogen bond + /// * `r_hb` - The equilibrium hydrogen bond distance in Å + /// * `d_hb` - The well depth of the hydrogen bond potential in kcal/mol + /// + /// # Return + /// + /// Returns the calculated hydrogen bond energy in kcal/mol. Negative values indicate + /// favorable hydrogen bonding interactions.
pub fn calculate_hbond( donor: &Atom, hydrogen: &Atom, @@ -71,10 +146,12 @@ impl EnergyCalculator { let v_hd = donor.position - hydrogen.position; let angle_ahd_deg = v_ha.angle(&v_hd).to_degrees(); + // Hydrogen bonds are only considered for angles > 90° if angle_ahd_deg <= 90.0 { return 0.0; } + // Combine delta values for flat-bottom cutoff let total_delta = (acceptor.delta.powi(2) + donor.delta.powi(2)).sqrt(); let distance_potential_fn = |d: f64| -> f64 { potentials::dreiding_hbond_12_10(d, r_hb, d_hb) }; From f592f1577c52f9740eab11fd978f97c53e90275d Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:08:40 -0700 Subject: [PATCH 27/83] docs(core): Add documentation for parameterization process and error handling --- .../src/core/forcefield/parameterization.rs | 210 ++++++++++++++++++ 1 file changed, 210 insertions(+) diff --git a/crates/scream-core/src/core/forcefield/parameterization.rs b/crates/scream-core/src/core/forcefield/parameterization.rs index da003015..7965bc77 100644 --- a/crates/scream-core/src/core/forcefield/parameterization.rs +++ b/crates/scream-core/src/core/forcefield/parameterization.rs @@ -13,40 +13,90 @@ use std::collections::{HashMap, HashSet}; use thiserror::Error; use tracing::warn; +/// Defines the force field type identifier for hydrogen atoms that can participate as donors in hydrogen bonding. +/// +/// This constant represents the specific force field type used to identify hydrogen atoms +/// that are capable of forming hydrogen bonds as donors in the Dreiding force field. const DREIDING_HBOND_DONOR_HYDROGEN: &str = "H___A"; +/// Represents errors that can occur during the parameterization process. +/// +/// This enum encapsulates various error conditions that may arise when assigning +/// force field parameters to molecular systems or rotamers, providing detailed +/// context for debugging and error handling in molecular simulations. 
#[derive(Debug, Error, PartialEq, Eq)] pub enum ParameterizationError { + /// Indicates that a required van der Waals parameter is missing for a specific force field type. + /// + /// This error occurs when an atom's force field type is defined but no corresponding + /// VDW parameters (Lennard-Jones or Buckingham) are found in the force field data. #[error( "Missing VDW parameter for force field type: '{ff_type}' in atom '{atom_name}' of residue {residue_name}" )] MissingVdwParams { + /// The force field type that is missing parameters. ff_type: String, + /// The name of the atom with missing parameters. atom_name: String, + /// The name of the residue containing the atom. residue_name: String, }, + /// Indicates that a required anchor atom is missing or misclassified in the residue topology. + /// + /// This error occurs when the topology definition specifies an anchor atom that is either + /// not present in the system or has been incorrectly classified as a sidechain atom. #[error( "Missing or misclassified anchor atom in residue '{residue_name}': Cannot find required anchor atom '{atom_name}', or it was incorrectly defined as a sidechain atom in the topology." )] InvalidAnchorAtom { + /// The name of the residue with the invalid anchor atom. residue_name: String, + /// The name of the missing or misclassified anchor atom. atom_name: String, }, } +/// Handles the assignment of force field parameters to molecular systems and rotamers. +/// +/// This struct provides methods to parameterize atoms with their physicochemical properties, +/// including van der Waals parameters, delta values for flat-bottom potentials, and hydrogen +/// bonding classifications. It uses topology information to assign atom roles and force field +/// data to compute interaction parameters. pub struct Parameterizer<'a> { + /// Reference to the force field containing parameter definitions. forcefield: &'a Forcefield, + /// Reference to the topology registry for residue definitions. 
topology_registry: &'a TopologyRegistry, + /// Scaling factor for the sigma component of delta parameters. delta_s_factor: f64, } +/// Internal structure for storing calculated atom parameters before applying them. +/// +/// This struct holds the computed parameters for an atom during the parameterization +/// process, allowing for a two-phase approach where parameters are calculated first +/// and then applied to avoid borrowing conflicts. struct CalculatedAtomParams { + /// The delta value for flat-bottom potential modifications. delta: f64, + /// The cached van der Waals parameter for the atom. vdw_param: CachedVdwParam, + /// The hydrogen bonding classification ID (-1: none, 0: donor, 1: acceptor). hbond_type_id: i8, } impl<'a> Parameterizer<'a> { + /// Creates a new parameterizer with the given force field and topology registry. + /// + /// # Arguments + /// + /// * `forcefield` - The force field containing parameter definitions + /// * `topology_registry` - The registry of residue topologies + /// * `delta_s_factor` - Scaling factor for delta sigma values + /// + /// # Return + /// + /// Returns a new `Parameterizer` instance configured with the provided references. pub fn new( forcefield: &'a Forcefield, topology_registry: &'a TopologyRegistry, @@ -59,6 +109,26 @@ impl<'a> Parameterizer<'a> { } } + /// Parameterizes all atoms in a molecular system with force field properties. + /// + /// This method performs a complete parameterization of the system in two passes: + /// first assigning atom roles based on topology, then computing physicochemical + /// parameters for each atom. The two-pass approach ensures that mutable borrowing + /// conflicts are avoided during parameter calculation. + /// + /// # Arguments + /// + /// * `system` - The molecular system to parameterize (modified in place) + /// + /// # Return + /// + /// Returns `Ok(())` if parameterization succeeds, or an error if required parameters + /// or topology information is missing. 
+ /// + /// # Errors + /// + /// Returns `ParameterizationError::InvalidAnchorAtom` if required anchor atoms + /// are missing from any residue. pub fn parameterize_system( &self, system: &mut MolecularSystem, @@ -98,6 +168,27 @@ impl<'a> Parameterizer<'a> { Ok(()) } + /// Parameterizes a rotamer with force field properties. + /// + /// This method assigns atom roles based on the provided topology and computes + /// physicochemical parameters for each atom in the rotamer. It is designed for + /// use with individual rotamer conformations during side-chain placement. + /// + /// # Arguments + /// + /// * `rotamer` - The rotamer to parameterize (modified in place) + /// * `residue_name` - The name of the residue this rotamer represents + /// * `topology` - The topology definition for the residue + /// + /// # Return + /// + /// Returns `Ok(())` if parameterization succeeds, or an error if required parameters + /// or topology information is missing. + /// + /// # Errors + /// + /// Returns `ParameterizationError::InvalidAnchorAtom` if required anchor atoms + /// are missing from the rotamer. pub fn parameterize_rotamer( &self, rotamer: &mut Rotamer, @@ -129,6 +220,27 @@ impl<'a> Parameterizer<'a> { Ok(()) } + /// Assigns atom roles for a specific residue in the molecular system. + /// + /// This method determines the role (backbone, sidechain, etc.) of each atom in a residue + /// based on the chain type and available topology information. For protein residues, + /// it uses topology definitions to classify atoms; for other chain types, it assigns + /// roles based on the chain category. + /// + /// # Arguments + /// + /// * `residue_id` - The ID of the residue to parameterize + /// * `system` - The molecular system containing the residue + /// + /// # Return + /// + /// Returns `Ok(())` if role assignment succeeds, or an error if required topology + /// information is missing. 
+ /// + /// # Errors + /// + /// Returns `ParameterizationError::InvalidAnchorAtom` if required anchor atoms + /// are missing from the residue. fn assign_atom_roles_for_residue( &self, residue_id: ResidueId, @@ -187,6 +299,25 @@ impl<'a> Parameterizer<'a> { Ok(()) } + /// Calculates the core physicochemical parameters for an atom. + /// + /// This method computes the delta value (for flat-bottom potentials) and van der Waals + /// parameters for a single atom based on its force field type and residue-specific + /// delta parameters. + /// + /// # Arguments + /// + /// * `atom` - The atom to parameterize + /// * `residue_name` - The name of the residue containing the atom + /// + /// # Return + /// + /// Returns a tuple of `(delta, vdw_param)` if calculation succeeds. + /// + /// # Errors + /// + /// This method does not return errors directly, but the VDW parameter lookup + /// may result in `CachedVdwParam::None` if the force field type is not found. fn calculate_core_params( &self, atom: &Atom, @@ -212,6 +343,22 @@ impl<'a> Parameterizer<'a> { Ok((delta, vdw_param)) } + /// Determines the hydrogen bonding role for an atom in the molecular system. + /// + /// This method analyzes the atom's force field type and its bonding environment + /// to classify it as a hydrogen bond donor, acceptor, or neither. + /// + /// # Arguments + /// + /// * `atom_id` - The ID of the atom to classify + /// * `system` - The molecular system containing the atom + /// + /// # Return + /// + /// Returns an integer indicating the hydrogen bonding role: + /// - `-1`: Not involved in hydrogen bonding + /// - `0`: Hydrogen bond donor + /// - `1`: Hydrogen bond acceptor fn determine_hbond_role_for_system_atom( &self, atom_id: AtomId, @@ -231,6 +378,22 @@ impl<'a> Parameterizer<'a> { }) } + /// Determines the hydrogen bonding role for an atom in a rotamer. 
+ /// + /// This method analyzes the atom's force field type and its bonding environment + /// within the rotamer to classify it as a hydrogen bond donor, acceptor, or neither. + /// + /// # Arguments + /// + /// * `atom_index` - The index of the atom in the rotamer's atom list + /// * `rotamer` - The rotamer containing the atom + /// + /// # Return + /// + /// Returns an integer indicating the hydrogen bonding role: + /// - `-1`: Not involved in hydrogen bonding + /// - `0`: Hydrogen bond donor + /// - `1`: Hydrogen bond acceptor fn determine_hbond_role_for_rotamer_atom(&self, atom_index: usize, rotamer: &Rotamer) -> i8 { let atom = &rotamer.atoms[atom_index]; self.determine_hbond_role(&atom.force_field_type, || { @@ -256,6 +419,24 @@ impl<'a> Parameterizer<'a> { }) } + /// Determines the hydrogen bonding role based on force field type and neighbor information. + /// + /// This generic method implements the logic for classifying atoms as hydrogen bond + /// donors or acceptors based on their force field types and the types of their bonded + /// neighbors. It handles the special case of donor hydrogens that require checking + /// the heavy atom they are attached to. + /// + /// # Arguments + /// + /// * `ff_type` - The force field type of the atom + /// * `get_neighbor_ff_type` - A closure that returns the force field type of the bonded neighbor + /// + /// # Return + /// + /// Returns an integer indicating the hydrogen bonding role: + /// - `-1`: Not involved in hydrogen bonding + /// - `0`: Hydrogen bond donor + /// - `1`: Hydrogen bond acceptor fn determine_hbond_role<'b, F>(&self, ff_type: &str, get_neighbor_ff_type: F) -> i8 where F: Fn() -> Option<&'b str>, @@ -285,6 +466,30 @@ impl<'a> Parameterizer<'a> { } } +/// Assigns atom roles to a collection of atoms based on residue topology. +/// +/// This function implements a multi-step algorithm to classify atoms as backbone or +/// sidechain based on topology definitions. 
It uses a pool-based approach to handle +/// cases where multiple atoms may have the same name, ensuring proper assignment +/// by consuming atoms from different ends of the pool for anchor vs sidechain atoms. +/// +/// # Arguments +/// +/// * `atoms` - Mutable slice of atoms to assign roles to +/// * `get_name` - Function to extract the atom name from an atom +/// * `set_role` - Function to set the role on an atom +/// * `topology` - The topology definition for the residue +/// * `residue_name` - The name of the residue for error reporting +/// +/// # Return +/// +/// Returns `Ok(())` if role assignment succeeds, or an error if required anchor atoms +/// are missing. +/// +/// # Errors +/// +/// Returns `ParameterizationError::InvalidAnchorAtom` if any required anchor atom +/// is missing from the atom collection. fn assign_protein_roles_from_pool<'a, T>( atoms: &'a mut [T], get_name: impl for<'b> Fn(&'b T) -> &'b str, @@ -348,6 +553,11 @@ fn assign_protein_roles_from_pool<'a, T>( Ok(()) } +/// Converts a force field VDW parameter into a cached parameter for efficient computation. +/// +/// This implementation provides a direct conversion from the configuration-based +/// `VdwParam` enum to the runtime-optimized `CachedVdwParam` enum used during +/// energy calculations. 
impl From for CachedVdwParam { fn from(param: super::params::VdwParam) -> Self { match param { From a626c32da7c6a4023a2899091f7ad3fc04e16dc2 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:14:35 -0700 Subject: [PATCH 28/83] docs(core): Add documentation for scoring errors and energy calculation methods --- .../src/core/forcefield/scoring.rs | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/crates/scream-core/src/core/forcefield/scoring.rs b/crates/scream-core/src/core/forcefield/scoring.rs index ab3a1351..1b1a851a 100644 --- a/crates/scream-core/src/core/forcefield/scoring.rs +++ b/crates/scream-core/src/core/forcefield/scoring.rs @@ -6,29 +6,87 @@ use crate::core::models::system::MolecularSystem; use std::collections::HashSet; use thiserror::Error; +/// Represents errors that can occur during energy scoring operations. +/// +/// This enum encapsulates various error conditions that may arise when computing +/// interaction energies between atoms or groups of atoms, providing detailed +/// context for debugging and error handling in molecular simulations. #[derive(Debug, Error)] pub enum ScoringError { + /// Indicates that an atom with the specified ID was not found in the molecular system. + /// + /// This error occurs when attempting to access an atom that does not exist in the system, + /// which may happen due to invalid atom IDs or system state inconsistencies. #[error("Atom with ID {0:?} not found in the system")] AtomNotFound(AtomId), + /// Indicates that the donor atom for a hydrogen bond could not be identified. + /// + /// This error occurs when processing hydrogen bonding interactions where the donor + /// atom (typically the heavy atom attached to the hydrogen) cannot be found or + /// is not properly bonded to the hydrogen atom. #[error("Could not find donor for hydrogen atom {0:?}")] DonorNotFound(AtomId), + /// Indicates that an underlying energy calculation failed. 
+ /// + /// This error wraps errors from the energy calculation module, providing context + /// about which specific energy computation encountered a problem. #[error("Energy calculation failed: {source}")] EnergyCalculation { + /// The underlying energy calculation error. #[from] source: EnergyCalculationError, }, } +/// Provides methods for calculating interaction energies between atoms and groups of atoms. +/// +/// This struct serves as the main interface for computing molecular mechanics energies, +/// including van der Waals, electrostatic, and hydrogen bonding interactions. It uses +/// the molecular system topology and force field parameters to evaluate energies +/// between specified groups of atoms, with proper handling of bonded exclusions. pub struct Scorer<'a> { + /// Reference to the molecular system containing the atoms and their topology. system: &'a MolecularSystem, + /// Reference to the force field containing energy parameters and weights. forcefield: &'a Forcefield, } impl<'a> Scorer<'a> { + /// Creates a new scorer with the given molecular system and force field. + /// + /// # Arguments + /// + /// * `system` - The molecular system to score interactions in + /// * `forcefield` - The force field containing energy parameters + /// + /// # Return + /// + /// Returns a new `Scorer` instance configured with the provided references. pub fn new(system: &'a MolecularSystem, forcefield: &'a Forcefield) -> Self { Self { system, forcefield } } + /// Calculates the total interaction energy between two groups of atoms. + /// + /// This method computes the complete energy contribution from van der Waals, + /// electrostatic, and hydrogen bonding interactions between the specified + /// query and environment atom groups. It automatically handles exclusions + /// for bonded atoms and applies appropriate energy weighting. 
+ /// + /// # Arguments + /// + /// * `query_atom_ids` - IDs of atoms in the first group + /// * `environment_atom_ids` - IDs of atoms in the second group + /// + /// # Return + /// + /// Returns an `EnergyTerm` containing the total energy breakdown, or an error + /// if any atoms are not found or energy calculations fail. + /// + /// # Errors + /// + /// Returns `ScoringError::AtomNotFound` if any specified atom ID is invalid. + /// Returns `ScoringError::EnergyCalculation` if underlying energy computations fail. pub fn score_interaction( &self, query_atom_ids: &[AtomId], @@ -39,12 +97,52 @@ impl<'a> Scorer<'a> { Ok(energy) } + /// Calculates the internal energy of a group of atoms. + /// + /// This method computes the energy contributions within a single group of atoms, + /// excluding interactions between atoms separated by one or two bonds (1-2 and 1-3 interactions) + /// while including 1-4 and higher-order interactions. This is typically used for + /// evaluating the internal energy of molecular fragments or residues. + /// + /// # Arguments + /// + /// * `group_ids` - IDs of atoms in the group to evaluate + /// + /// # Return + /// + /// Returns an `EnergyTerm` containing the internal energy breakdown, or an error + /// if any atoms are not found or energy calculations fail. + /// + /// # Errors + /// + /// Returns `ScoringError::AtomNotFound` if any specified atom ID is invalid. + /// Returns `ScoringError::EnergyCalculation` if underlying energy computations fail. pub fn score_group_internal(&self, group_ids: &[AtomId]) -> Result { let mut energy = self.score_vdw_coulomb(group_ids, group_ids)?; energy += self.score_hbond(group_ids, group_ids)?; Ok(energy) } + /// Calculates van der Waals and Coulomb energies between two atom groups. + /// + /// This method computes non-bonded interactions between atoms in two groups, + /// automatically excluding 1-2 and 1-3 bonded interactions to prevent double-counting + /// of bonded terms. 
For internal energy calculations (same group), it also avoids + /// computing symmetric pairs twice. + /// + /// # Arguments + /// + /// * `group1_ids` - IDs of atoms in the first group + /// * `group2_ids` - IDs of atoms in the second group + /// + /// # Return + /// + /// Returns an `EnergyTerm` containing VDW and Coulomb energy contributions. + /// + /// # Errors + /// + /// Returns `ScoringError::AtomNotFound` if any atom ID is invalid. + /// Returns `ScoringError::EnergyCalculation` if energy computations fail. fn score_vdw_coulomb( &self, group1_ids: &[AtomId], @@ -106,6 +204,26 @@ impl<'a> Scorer<'a> { Ok(energy) } + /// Calculates hydrogen bonding energies between two atom groups. + /// + /// This method identifies potential hydrogen bond donor-acceptor pairs between + /// the two groups and computes their interaction energies. For internal calculations + /// (same group), it avoids double-counting by computing each pair only once. + /// + /// # Arguments + /// + /// * `group1_ids` - IDs of atoms in the first group + /// * `group2_ids` - IDs of atoms in the second group + /// + /// # Return + /// + /// Returns an `EnergyTerm` containing hydrogen bonding energy contributions. + /// + /// # Errors + /// + /// Returns `ScoringError::AtomNotFound` if any atom ID is invalid. + /// Returns `ScoringError::DonorNotFound` if a hydrogen donor cannot be identified. + /// Returns `ScoringError::EnergyCalculation` if energy computations fail. fn score_hbond( &self, group1_ids: &[AtomId], @@ -123,6 +241,27 @@ impl<'a> Scorer<'a> { Ok(energy) } + /// Calculates hydrogen bonding energies in one direction between donor and acceptor groups. + /// + /// This method iterates through potential hydrogen donors in the first group and + /// potential acceptors in the second group, computing hydrogen bond energies + /// for valid donor-hydrogen-acceptor triplets. It ensures that donors and acceptors + /// are properly classified and that appropriate force field parameters exist. 
+ /// + /// # Arguments + /// + /// * `donor_group_ids` - IDs of atoms that may act as hydrogen donors + /// * `acceptor_group_ids` - IDs of atoms that may act as hydrogen acceptors + /// + /// # Return + /// + /// Returns an `EnergyTerm` containing hydrogen bonding energy contributions. + /// + /// # Errors + /// + /// Returns `ScoringError::AtomNotFound` if any atom ID is invalid. + /// Returns `ScoringError::DonorNotFound` if a hydrogen donor cannot be identified. + /// Returns `ScoringError::EnergyCalculation` if energy computations fail. fn calculate_hbond_one_way( &self, donor_group_ids: &[AtomId], From 6d4d72944e84eed7e422066614632f30e226f299 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:15:55 -0700 Subject: [PATCH 29/83] docs(core): Add documentation for the Force Field module, including usage examples and key components --- crates/scream-core/src/core/forcefield/mod.rs | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/crates/scream-core/src/core/forcefield/mod.rs b/crates/scream-core/src/core/forcefield/mod.rs index 640f404c..484178e0 100644 --- a/crates/scream-core/src/core/forcefield/mod.rs +++ b/crates/scream-core/src/core/forcefield/mod.rs @@ -1,3 +1,42 @@ +//! # Force Field Module +//! +//! This module provides the core functionality for molecular mechanics force field calculations +//! in the SCREAM++ protein side-chain placement library. It implements energy evaluation, +//! parameter management, and scoring algorithms for molecular interactions. +//! +//! ## Overview +//! +//! The force field module is responsible for computing interaction energies between atoms +//! and molecular groups using classical molecular mechanics potentials. It supports: +//! +//! - **Van der Waals interactions** using Lennard-Jones and Buckingham potentials +//! - **Electrostatic interactions** with Coulomb's law and distance-dependent dielectric +//! - **Hydrogen bonding** using specialized 12-10 potentials +//! 
- **Energy weighting** based on atom roles (backbone vs sidechain) +//! - **Parameter management** for different force field types and atom types +//! +//! ## Key Components +//! +//! - [`energy`] - Core energy calculation functions for different potential types +//! - [`params`] - Force field parameter structures and configuration +//! - [`potentials`] - Potential energy function implementations +//! - [`scoring`] - High-level energy scoring interface for molecular systems +//! - [`term`] - Energy term aggregation and reporting +//! - [`parameterization`] - Automatic assignment of force field parameters to atoms +//! +//! ## Usage +//! +//! The main entry point for energy calculations is the [`scoring::Scorer`] struct, +//! which provides methods to compute interaction energies between atom groups while +//! properly handling bonded exclusions and energy weighting. +//! +//! ```ignore +//! use scream_core::core::forcefield::scoring::Scorer; +//! +//! let scorer = Scorer::new(&system, &forcefield); +//! let energy = scorer.score_interaction(query_atoms, environment_atoms)?; +//! ``` + pub(crate) mod energy; pub mod parameterization; pub mod params; From 0ca3b5fed79614b3afb12ddc8fa5067d3308c51f Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:17:38 -0700 Subject: [PATCH 30/83] docs(core): Update documentation for the Force Field module by removing private component references --- crates/scream-core/src/core/forcefield/mod.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/crates/scream-core/src/core/forcefield/mod.rs b/crates/scream-core/src/core/forcefield/mod.rs index 484178e0..59dcf7d9 100644 --- a/crates/scream-core/src/core/forcefield/mod.rs +++ b/crates/scream-core/src/core/forcefield/mod.rs @@ -17,9 +17,7 @@ //! //! ## Key Components //! -//! - [`energy`] - Core energy calculation functions for different potential types //! - [`params`] - Force field parameter structures and configuration -//! 
- [`potentials`] - Potential energy function implementations //! - [`scoring`] - High-level energy scoring interface for molecular systems //! - [`term`] - Energy term aggregation and reporting //! - [`parameterization`] - Automatic assignment of force field parameters to atoms From 013ce7f455f81ff302fd3f567a41f2122a8746fd Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:24:52 -0700 Subject: [PATCH 31/83] docs(core): Correct import path for Scorer in usage example of Force Field module --- crates/scream-core/src/core/forcefield/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/core/forcefield/mod.rs b/crates/scream-core/src/core/forcefield/mod.rs index 59dcf7d9..2daea733 100644 --- a/crates/scream-core/src/core/forcefield/mod.rs +++ b/crates/scream-core/src/core/forcefield/mod.rs @@ -29,7 +29,7 @@ //! properly handling bonded exclusions and energy weighting. //! //! ```ignore -//! use scream_core::core::forcefield::scoring::Scorer; +//! use screampp::core::forcefield::scoring::Scorer; //! //! let scorer = Scorer::new(&system, &forcefield); //! let energy = scorer.score_interaction(query_atoms, environment_atoms)?; From e10959be5fefee6d077cbe98a9a02e5732308d64 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:29:38 -0700 Subject: [PATCH 32/83] docs(core): Add documentation for TopologyRegistry and ResidueTopology structs --- .../scream-core/src/core/topology/registry.rs | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/crates/scream-core/src/core/topology/registry.rs b/crates/scream-core/src/core/topology/registry.rs index 0b88e0ee..9642eb15 100644 --- a/crates/scream-core/src/core/topology/registry.rs +++ b/crates/scream-core/src/core/topology/registry.rs @@ -3,24 +3,65 @@ use std::collections::HashMap; use std::path::Path; use thiserror::Error; +/// Represents the topological structure of an amino acid residue. 
+/// +/// This struct defines the atom composition and classification for a specific +/// residue type, distinguishing between anchor (backbone) atoms that define +/// the peptide chain connectivity and sidechain atoms that contribute to +/// the residue's unique chemical properties. #[derive(Debug, Deserialize, Clone, PartialEq, Eq)] #[serde(deny_unknown_fields)] pub struct ResidueTopology { + /// Names of atoms that serve as anchor points for the peptide backbone. + /// + /// These atoms (typically N, CA, C) define the connectivity and geometry + /// of the polypeptide chain and are conserved across most amino acids. pub anchor_atoms: Vec, + /// Names of atoms that belong to the residue's sidechain. + /// + /// These atoms vary between different amino acid types and determine + /// the chemical properties and functionality of each residue. pub sidechain_atoms: Vec, } +/// Manages a collection of residue topology definitions for molecular systems. +/// +/// This registry provides centralized access to topological information for different +/// amino acid residue types, enabling consistent atom classification and structural +/// analysis across the molecular modeling pipeline. #[derive(Debug, Clone, Default)] pub struct TopologyRegistry { + /// Internal storage mapping residue names to their topology definitions. registry: HashMap, } impl TopologyRegistry { + /// Loads residue topology definitions from a TOML configuration file. + /// + /// This method reads and parses a TOML file containing topology definitions + /// for various amino acid residues, populating the registry with the parsed data. + /// + /// # Arguments + /// + /// * `path` - Path to the TOML file containing topology definitions + /// + /// # Return + /// + /// Returns a new `TopologyRegistry` instance populated with the loaded topologies, + /// or an error if the file cannot be read or parsed. + /// + /// # Errors + /// + /// Returns `TopologyLoadError::Io` if the file cannot be read. 
+ /// Returns `TopologyLoadError::Toml` if the file content is not valid TOML or + /// contains invalid topology definitions. pub fn load(path: &Path) -> Result { + // Read the entire file content into memory for TOML parsing let content = std::fs::read_to_string(path).map_err(|e| TopologyLoadError::Io { path: path.to_string_lossy().to_string(), source: e, })?; + // Parse the TOML content into a HashMap of residue topologies let registry: HashMap = toml::from_str(&content).map_err(|e| TopologyLoadError::Toml { path: path.to_string_lossy().to_string(), @@ -29,21 +70,50 @@ impl TopologyRegistry { Ok(Self { registry }) } + /// Retrieves the topology definition for a specific residue type. + /// + /// This method provides access to the topology information for a given + /// amino acid residue, allowing classification of its constituent atoms. + /// + /// # Arguments + /// + /// * `residue_name` - Three-letter code of the amino acid residue (e.g., "ALA", "GLY") + /// + /// # Return + /// + /// Returns `Some(&ResidueTopology)` if the residue is found in the registry, + /// or `None` if the residue type is not defined. pub fn get(&self, residue_name: &str) -> Option<&ResidueTopology> { self.registry.get(residue_name) } } +/// Represents errors that can occur when loading topology definitions from files. +/// +/// This enum encapsulates various failure modes during the topology loading process, +/// providing detailed context about what went wrong and where. #[derive(Debug, Error)] pub enum TopologyLoadError { + /// Indicates that the topology file could not be read from disk. + /// + /// This error occurs when there are permission issues, the file doesn't exist, + /// or other I/O-related problems prevent reading the topology configuration. #[error("File I/O error for '{path}': {source}")] Io { + /// The path to the file that could not be read. path: String, + /// The underlying I/O error that occurred. 
source: std::io::Error, }, + /// Indicates that the topology file content is not valid TOML or contains invalid data. + /// + /// This error occurs when the file exists but cannot be parsed as TOML, + /// or when the parsed data doesn't match the expected `ResidueTopology` structure. #[error("TOML parsing error for '{path}': {source}")] Toml { + /// The path to the file that could not be parsed. path: String, + /// The underlying TOML parsing error that occurred. source: toml::de::Error, }, } From eec2db88c8606e1ccc19a900edcba68ac456dabf Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:38:53 -0700 Subject: [PATCH 33/83] docs(core): Enhance documentation for Input/Output module with comprehensive details and usage examples --- crates/scream-core/src/core/io/mod.rs | 43 +++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/crates/scream-core/src/core/io/mod.rs b/crates/scream-core/src/core/io/mod.rs index 9cd19545..0543802b 100644 --- a/crates/scream-core/src/core/io/mod.rs +++ b/crates/scream-core/src/core/io/mod.rs @@ -1,9 +1,42 @@ -//! Provides input/output functionality for molecular file formats. +//! # Input/Output Module //! -//! This module contains implementations for reading and writing various molecular -//! structure file formats commonly used in computational chemistry and structural -//! biology. It provides a unified trait-based interface for file I/O operations -//! and includes utilities for canonical ordering of molecular components. +//! This module provides comprehensive input/output functionality for molecular file formats +//! used in computational chemistry and structural biology applications. +//! +//! ## Overview +//! +//! The I/O module enables SCREAM++ to read from and write to various molecular structure +//! file formats, providing a unified interface for molecular data exchange. It supports: +//! +//! - **File format parsing** - Reading molecular structures from standard formats +//! 
- **File format writing** - Exporting molecular systems to various output formats +//! - **Canonical ordering** - Consistent sorting of atoms and residues for reproducible output +//! - **Trait-based design** - Extensible interface for adding new file format support +//! +//! ## Key Components +//! +//! - [`bgf`] - Implementation for BGF (BioGraf) file format I/O +//! - [`traits`] - Common traits defining the molecular file I/O interface +//! +//! ## Usage +//! +//! The module provides a trait-based approach to file I/O, allowing different formats +//! to implement a common interface for reading and writing molecular structures. +//! +//! ```ignore +//! use screampp::core::io::{bgf::BgfFile, traits::MolecularFile}; +//! use std::fs::File; +//! use std::io::BufReader; +//! +//! // Read a molecular structure from a BGF file +//! let file = File::open("molecule.bgf")?; +//! let mut reader = BufReader::new(file); +//! let (system, metadata) = BgfFile::read_from(&mut reader)?; +//! +//! // Write a molecular system to a BGF file +//! let mut file = File::create("output.bgf")?; +//! BgfFile::write_system_to(&system, &mut file)?; +//! ``` pub mod bgf; pub(crate) mod sorting; From ae6070770ecddf6b51cb454087573da6f8044bd3 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:40:55 -0700 Subject: [PATCH 34/83] docs(core): Revise and expand documentation for Core Models module, enhancing clarity and structure --- crates/scream-core/src/core/models/mod.rs | 44 ++++++++++++++++++++--- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/crates/scream-core/src/core/models/mod.rs b/crates/scream-core/src/core/models/mod.rs index da24b753..b8ba6213 100644 --- a/crates/scream-core/src/core/models/mod.rs +++ b/crates/scream-core/src/core/models/mod.rs @@ -1,9 +1,43 @@ -//! Core data models for molecular structures. +//! # Core Models Module //! -//! This module contains the fundamental data structures used to represent -//! 
molecular systems in SCREAM++, including atoms, residues, chains, and -//! their topological relationships. These models provide the foundation -//! for molecular modeling and simulation algorithms. +//! This module contains the fundamental data structures and models used to represent +//! molecular systems in SCREAM++, providing the foundation for all molecular modeling operations. +//! +//! ## Overview +//! +//! The models module defines the core abstractions for representing molecular structures, +//! including atoms, residues, chains, and their topological relationships. These models +//! are designed to: +//! +//! - **Represent molecular structure** - Complete description of atomic coordinates and connectivity +//! - **Support efficient operations** - Optimized data structures for computational algorithms +//! - **Enable extensibility** - Flexible design for different molecular types and properties +//! - **Maintain type safety** - Strong typing for molecular data integrity +//! +//! ## Key Components +//! +//! - [`atom`] - Individual atom representation with coordinates, types, and properties +//! - [`residue`] - Amino acid residue structure and classification +//! - [`chain`] - Polypeptide chain organization and metadata +//! - [`system`] - Complete molecular system with all components and relationships +//! - [`topology`] - Bond connectivity and molecular topology information +//! - [`ids`] - Unique identifier types for atoms, residues, and chains +//! +//! ## Usage +//! +//! The models form the backbone of molecular data representation in SCREAM++. +//! Most operations start with constructing or manipulating these core structures. +//! +//! ```ignore +//! use screampp::core::models::{system::MolecularSystem, atom::Atom}; +//! +//! let mut system = MolecularSystem::new(); +//! let chain_id = system.add_chain('A', ChainType::Protein); +//! let residue_id = system.add_residue(chain_id, 1, "ALA", None)?; +//! +//! 
let atom = Atom::new("CA", residue_id, Point3::new(0.0, 0.0, 0.0)); +//! system.add_atom_to_residue(residue_id, atom)?; +//! ``` pub mod atom; pub mod chain; From ff7aa0c3d194ce664854775f8d1e6e6ba25544a8 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:42:29 -0700 Subject: [PATCH 35/83] docs(core): Add detailed documentation for Topology module, including overview, key components, and usage examples --- crates/scream-core/src/core/topology/mod.rs | 31 +++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/crates/scream-core/src/core/topology/mod.rs b/crates/scream-core/src/core/topology/mod.rs index d1089905..0b18ac47 100644 --- a/crates/scream-core/src/core/topology/mod.rs +++ b/crates/scream-core/src/core/topology/mod.rs @@ -1 +1,32 @@ +//! # Topology Module +//! +//! This module provides functionality for managing molecular topology information, +//! including residue structures and atom classifications for protein systems. +//! +//! ## Overview +//! +//! The topology module defines the structural organization of molecular systems, +//! particularly focusing on amino acid residues and their constituent atoms. +//! It provides data structures and utilities for: +//! +//! - **Residue topology definitions** - Specifying anchor and sidechain atoms for each residue type +//! - **Topology registry** - Loading and accessing residue topology information from configuration files +//! - **Atom classification** - Distinguishing between backbone and sidechain atoms +//! +//! ## Key Components +//! +//! - [`registry`] - Topology registry for loading and accessing residue definitions +//! +//! ## Usage +//! +//! Residue topologies are typically loaded from TOML configuration files and used +//! to classify atoms during molecular system construction and analysis. +//! +//! ```ignore +//! use screampp::core::topology::registry::TopologyRegistry; +//! +//! let registry = TopologyRegistry::load(std::path::Path::new("topology.toml"))?; +//! 
let ala_topology = registry.get("ALA").unwrap(); +//! ``` + pub mod registry; From 68cb860af814233d513a50e7a00a94651be92a81 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:50:54 -0700 Subject: [PATCH 36/83] docs(core): Enhance RotamerAtomData and RotamerData structs with additional fields and detailed documentation --- .../scream-core/src/core/rotamers/rotamer.rs | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/crates/scream-core/src/core/rotamers/rotamer.rs b/crates/scream-core/src/core/rotamers/rotamer.rs index e48ade3c..a48bbd13 100644 --- a/crates/scream-core/src/core/rotamers/rotamer.rs +++ b/crates/scream-core/src/core/rotamers/rotamer.rs @@ -1,23 +1,87 @@ use crate::core::models::atom::Atom; use serde::Deserialize; +/// Represents atom data for a single atom in a rotamer, suitable for deserialization from external files. +/// +/// This struct contains the essential information needed to reconstruct an atom +/// within a protein side-chain rotamer, including its position, charge, and +/// force field classification. It serves as an intermediate representation +/// between serialized rotamer data and the full `Atom` structure used in molecular systems. #[derive(Debug, Clone, Deserialize)] pub struct RotamerAtomData { + /// The serial number of the atom within the rotamer. + /// + /// This corresponds to the atom's position in the rotamer's atom list + /// and is used for establishing connectivity relationships. pub serial: usize, + /// The name of the atom (e.g., "CA", "CB", "CG"). + /// + /// This follows standard PDB atom naming conventions and helps + /// identify the atom's role in the amino acid side chain. pub atom_name: String, + /// The partial charge of the atom in atomic units. + /// + /// This value is used in electrostatic energy calculations and + /// is typically derived from quantum mechanical calculations or + /// empirical force field parameters. 
pub partial_charge: f64, + /// The 3D coordinates of the atom in Cartesian space. + /// + /// Coordinates are stored as [x, y, z] in Angstroms and represent + /// the atom's position relative to the rotamer's local coordinate system. pub position: [f64; 3], + /// The force field atom type identifier. + /// + /// This string identifies the atom's type in the molecular mechanics + /// force field (e.g., "C_3", "N_R", "O_2") and determines which + /// parameters to use for energy calculations. pub force_field_type: String, } +/// Represents the complete data for a protein side-chain rotamer, suitable for deserialization. +/// +/// This struct encapsulates all the information needed to define a specific +/// conformational state of an amino acid side chain, including both atomic +/// properties and connectivity. Rotamers are discrete conformational states +/// that amino acid side chains can adopt, and this structure provides the +/// data needed to reconstruct them from external storage formats. #[derive(Debug, Clone, Deserialize)] pub struct RotamerData { + /// The atoms that make up this rotamer. + /// + /// Each atom contains its name, position, charge, and force field type. + /// The atoms are ordered by their serial numbers for consistent indexing. pub atoms: Vec<RotamerAtomData>, + /// The bonds connecting atoms within the rotamer. + /// + /// Each bond is represented as a pair of atom serial numbers [serial1, serial2], + /// where each value must match the `serial` of an entry in the `atoms` vector. + /// Only bonds within the side chain are included (backbone connectivity + /// is handled separately). pub bonds: Vec<[usize; 2]>, } +/// Represents a protein side-chain rotamer in its runtime form. +/// +/// This struct contains the complete molecular representation of a specific +/// conformational state of an amino acid side chain, ready for use in +/// molecular modeling and energy calculations.
Unlike `RotamerData`, this +/// structure uses the full `Atom` type from the molecular system, making +/// it suitable for integration with the broader molecular modeling framework. #[derive(Debug, Clone)] pub struct Rotamer { + /// The atoms that constitute this rotamer. + /// + /// Each atom is a complete `Atom` object with all necessary properties + /// for molecular modeling, including coordinates, charges, and force + /// field parameters. The atoms are ordered consistently for reliable + /// indexing and energy calculations. pub atoms: Vec, + /// The bonds connecting atoms within the rotamer. + /// + /// Each bond is represented as a pair of indices (atom1_idx, atom2_idx) + /// that reference positions in the `atoms` vector. These define the + /// connectivity of the side chain atoms and are essential for molecular + /// mechanics calculations and structural analysis. pub bonds: Vec<(usize, usize)>, } From d833e6bbd88d49ba72459b33940a7682c3465eff Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 16:59:55 -0700 Subject: [PATCH 37/83] docs(core): Expand documentation for RotamerLibrary and LibraryLoadError, detailing methods and error handling --- .../scream-core/src/core/rotamers/library.rs | 172 +++++++++++++++++- 1 file changed, 165 insertions(+), 7 deletions(-) diff --git a/crates/scream-core/src/core/rotamers/library.rs b/crates/scream-core/src/core/rotamers/library.rs index 88e0d312..b250b4e5 100644 --- a/crates/scream-core/src/core/rotamers/library.rs +++ b/crates/scream-core/src/core/rotamers/library.rs @@ -18,43 +18,106 @@ use std::path::Path; use std::str::FromStr; use thiserror::Error; +/// Type alias for the raw rotamer data loaded from TOML files. +/// +/// This represents the deserialized structure of rotamer library files, +/// mapping residue names to their corresponding rotamer data arrays. +/// Used internally during the loading process before conversion to +/// the runtime `Rotamer` format. 
type RawRotamerFile = HashMap<String, Vec<RotamerData>>; +/// Manages a collection of pre-computed rotamer conformations for protein side-chain modeling. +/// +/// This library serves as the central repository for rotamer data in SCREAM++, +/// providing access to discrete conformational states of amino acid side chains. +/// Rotamers are pre-computed low-energy conformations that represent the most +/// probable side-chain orientations, enabling efficient protein structure prediction +/// and refinement algorithms. #[derive(Debug, Default, Clone)] pub struct RotamerLibrary { + /// The core storage mapping residue types to their available rotamers. + /// + /// Each residue type (e.g., Alanine, Glycine) is associated with a vector + /// of possible rotamer conformations. These rotamers are fully parameterized + /// and ready for use in molecular mechanics calculations. pub rotamers: HashMap<ResidueType, Vec<Rotamer>>, } +/// Represents errors that can occur during rotamer library loading and processing. +/// +/// This enum encompasses all possible failure modes when loading rotamer data +/// from external files, validating topology consistency, and parameterizing +/// rotamers for molecular mechanics calculations. #[derive(Debug, Error)] pub enum LibraryLoadError { + /// Indicates that the rotamer library file could not be read from disk. + /// + /// This error occurs when there are permission issues, the file doesn't exist, + /// or other I/O-related problems prevent reading the rotamer configuration. #[error("File I/O error for '{path}': {source}")] Io { + /// The path to the file that could not be read. path: String, + /// The underlying I/O error that occurred. source: std::io::Error, }, + /// Indicates that the rotamer library file content is not valid TOML. + /// + /// This error occurs when the file exists but cannot be parsed as TOML, + /// or when the parsed data doesn't match the expected `RotamerData` structure.
#[error("TOML parsing error for '{path}': {source}")] Toml { + /// The path to the file that could not be parsed. path: String, + /// The underlying TOML parsing error that occurred. source: toml::de::Error, }, + /// Indicates that an unknown residue type was encountered in the rotamer file. + /// + /// This error occurs when the rotamer library contains a residue name that + /// cannot be mapped to a known `ResidueType` enum variant, indicating + /// either a typo or missing residue type definition. #[error("Unknown residue type '{0}' found in library file")] UnknownResidueType(String), + /// Indicates that a residue type in the rotamer library lacks a topology definition. + /// + /// This error occurs when rotamer data exists for a residue type, but no + /// corresponding topology information is available in the topology registry. + /// Topology definitions are required for proper atom classification and parameterization. #[error( "Missing topology definition for residue type '{0}', which is present in the rotamer library" )] MissingTopology(String), + /// Indicates that parameterization of a rotamer failed during loading. + /// + /// This error occurs when the force field parameterization process encounters + /// issues, such as missing parameters or invalid atom types, preventing + /// the rotamer from being used in energy calculations. #[error( "Parameterization failed for residue '{residue_type}' in rotamer from file '{path}': {source}" )] Parameterization { + /// The path to the rotamer file being processed. path: String, + /// The residue type that failed parameterization. residue_type: String, + /// The underlying parameterization error that occurred. source: ParameterizationError, }, + /// Indicates that a bond definition references a non-existent atom serial. + /// + /// This error occurs when a bond in the rotamer data refers to an atom + /// serial number that doesn't exist in the atom list, indicating corrupted + /// or malformed rotamer data. 
#[error( "Invalid bond definition in rotamer library for residue '{residue_type}': bond references non-existent atom serial '{serial}'" )] InvalidBondSerial { residue_type: String, serial: usize }, + /// Indicates that duplicate atom serial numbers were found in a rotamer definition. + /// + /// This error occurs when multiple atoms in the same rotamer have the same + /// serial number, which would cause conflicts in bond definitions and + /// atom indexing. #[error( "Duplicate atom serial '{serial}' found in rotamer definition for residue '{residue_type}'" )] @@ -62,13 +125,41 @@ pub enum LibraryLoadError { } impl RotamerLibrary { + /// Loads and parameterizes a rotamer library from a TOML configuration file. + /// + /// This method performs a multi-phase process to load rotamer data, validate + /// consistency with topology definitions, and parameterize rotamers for use + /// in molecular mechanics calculations. The process includes error checking + /// and cross-validation to ensure data integrity. + /// + /// # Arguments + /// + /// * `rotamer_toml_path` - Path to the TOML file containing rotamer definitions + /// * `topology_registry` - Registry containing residue topology information + /// * `forcefield` - Force field parameters for rotamer parameterization + /// * `delta_s_factor` - Scaling factor for delta parameters in parameterization + /// + /// # Return + /// + /// Returns a fully loaded and parameterized `RotamerLibrary` ready for use, + /// or an error if loading, validation, or parameterization fails. + /// + /// # Errors + /// + /// Returns `LibraryLoadError::Io` if the file cannot be read. + /// Returns `LibraryLoadError::Toml` if the file is not valid TOML. + /// Returns `LibraryLoadError::UnknownResidueType` if unknown residues are found. + /// Returns `LibraryLoadError::MissingTopology` if topology definitions are missing. + /// Returns `LibraryLoadError::Parameterization` if parameterization fails. 
+ /// Returns `LibraryLoadError::InvalidBondSerial` if bond definitions are invalid. + /// Returns `LibraryLoadError::DuplicateAtomSerial` if duplicate serials exist. pub fn load( rotamer_toml_path: &Path, topology_registry: &TopologyRegistry, forcefield: &Forcefield, delta_s_factor: f64, ) -> Result { - // --- Phase 1: Load raw rotamer data from TOML --- + // Phase 1: Load raw rotamer data from TOML file let content = std::fs::read_to_string(rotamer_toml_path).map_err(|e| LibraryLoadError::Io { path: rotamer_toml_path.to_string_lossy().to_string(), @@ -80,11 +171,11 @@ impl RotamerLibrary { source: e, })?; - // --- Phase 2: Create a parameterizer for pre-parameterizing rotamers --- + // Phase 2: Create parameterizer for pre-parameterizing rotamers let parameterizer = Parameterizer::new(forcefield, topology_registry, delta_s_factor); let mut final_rotamers_map = HashMap::new(); - // --- Phase 3: Process and parameterize each rotamer --- + // Phase 3: Process and parameterize each rotamer for (res_name, raw_rotamer_list) in raw_lib { let residue_type = ResidueType::from_str(&res_name) .map_err(|_| LibraryLoadError::UnknownResidueType(res_name.clone()))?; @@ -112,6 +203,30 @@ impl RotamerLibrary { }) } + /// Processes and parameterizes a single raw rotamer from the library file. + /// + /// This method converts raw rotamer data into a fully parameterized `Rotamer` + /// structure suitable for molecular mechanics calculations. It handles atom + /// creation, bond validation, and force field parameterization. 
+ /// + /// # Arguments + /// + /// * `raw_rotamer_data` - The raw rotamer data from the TOML file + /// * `parameterizer` - The parameterizer configured for this library + /// * `res_name` - Name of the residue type being processed + /// * `topology` - Topology definition for the residue + /// * `path_for_error` - File path for error reporting + /// + /// # Return + /// + /// Returns a fully parameterized `Rotamer` ready for use, or an error + /// if processing or parameterization fails. + /// + /// # Errors + /// + /// Returns `LibraryLoadError::InvalidBondSerial` if bond references invalid atoms. + /// Returns `LibraryLoadError::DuplicateAtomSerial` if duplicate serials exist. + /// Returns `LibraryLoadError::Parameterization` if parameterization fails. fn process_raw_rotamer( raw_rotamer_data: &RotamerData, parameterizer: &Parameterizer, @@ -175,10 +290,37 @@ impl RotamerLibrary { Ok(rotamer) } + /// Retrieves all available rotamers for a specific residue type. + /// + /// This method provides access to the pre-computed rotamer conformations + /// for a given amino acid type, enabling side-chain placement algorithms + /// to evaluate different conformational possibilities. + /// + /// # Arguments + /// + /// * `residue_type` - The type of amino acid residue to get rotamers for + /// + /// # Return + /// + /// Returns `Some(&Vec<Rotamer>)` containing all rotamers for the residue type, + /// or `None` if no rotamers are available for the requested type. pub fn get_rotamers_for(&self, residue_type: ResidueType) -> Option<&Vec<Rotamer>> { self.rotamers.get(&residue_type) } + /// Extracts and includes rotamer conformations from existing molecular system residues. + /// + /// This method allows incorporating experimentally determined or user-provided + /// conformations from a molecular system into the rotamer library. It extracts + /// side-chain conformations from specified residues and adds them as additional + /// rotamer options for the corresponding residue types.
+ /// + /// # Arguments + /// + /// * `system` - The molecular system containing the residues to extract from + /// * `active_residues` - Set of residue IDs to extract conformations from + /// * `topology_registry` - Registry for residue topology information + /// * `parameterizer` - Parameterizer for force field parameterization pub fn include_system_conformations( &mut self, system: &MolecularSystem, @@ -211,6 +353,22 @@ impl RotamerLibrary { } } + /// Extracts a complete rotamer conformation from a molecular system residue. + /// + /// This method reconstructs a rotamer from the atoms and bonds of a specific + /// residue in a molecular system, following the topology definition to ensure + /// all required atoms are present and properly classified. + /// + /// # Arguments + /// + /// * `system` - The molecular system containing the residue + /// * `residue_id` - ID of the residue to extract the rotamer from + /// * `topology_registry` - Registry containing topology definitions + /// + /// # Return + /// + /// Returns `Some(Rotamer)` if extraction succeeds, or `None` if the residue + /// is missing required atoms or topology information. 
fn extract_rotamer_from_system( &self, system: &MolecularSystem, @@ -220,7 +378,7 @@ impl RotamerLibrary { let residue = system.residue(residue_id)?; let topology = topology_registry.get(&residue.name)?; - // --- Step 1: Build the consuming pool of atoms from the source residue --- + // Step 1: Build the consuming pool of atoms from the source residue let mut atom_pool: HashMap> = HashMap::new(); for &atom_id in residue.atoms() { if let Some(atom) = system.atom(atom_id) { @@ -235,7 +393,7 @@ impl RotamerLibrary { let mut old_id_to_new_index = HashMap::new(); let mut consumed_atom_ids = HashSet::new(); - // --- Step 2: Extract ANCHOR atoms (Mandatory) --- + // Step 2: Extract ANCHOR atoms (Mandatory) for anchor_name in &topology.anchor_atoms { match atom_pool.get_mut(anchor_name) { Some(ids) if !ids.is_empty() => { @@ -259,7 +417,7 @@ impl RotamerLibrary { } } - // --- Step 3: Extract SIDECHAIN atoms (Mandatory) --- + // Step 3: Extract SIDECHAIN atoms (Mandatory) for sidechain_name in &topology.sidechain_atoms { match atom_pool.get_mut(sidechain_name) { Some(ids) if !ids.is_empty() => { @@ -283,7 +441,7 @@ impl RotamerLibrary { } } - // --- Step 4: Reconstruct bonds for the new, complete rotamer --- + // Step 4: Reconstruct bonds for the new, complete rotamer let mut new_rotamer_bonds = Vec::new(); for (&old_id_a, &new_idx_a) in &old_id_to_new_index { if let Some(neighbors) = system.get_bonded_neighbors(old_id_a) { From b3209556143f256b2ea13b8515d38053abf4aab1 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 17:02:50 -0700 Subject: [PATCH 38/83] docs(core): Add comprehensive documentation for Rotamers module, including overview, key components, scientific background, and usage examples --- crates/scream-core/src/core/rotamers/mod.rs | 40 +++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/crates/scream-core/src/core/rotamers/mod.rs b/crates/scream-core/src/core/rotamers/mod.rs index ca1faf3a..95a556d7 100644 --- 
a/crates/scream-core/src/core/rotamers/mod.rs +++ b/crates/scream-core/src/core/rotamers/mod.rs @@ -1,2 +1,42 @@ +//! # Rotamers Module +//! +//! This module provides comprehensive functionality for managing protein side-chain rotamers +//! in SCREAM++, enabling efficient and accurate protein structure prediction and refinement. +//! +//! ## Overview +//! +//! The rotamers module implements support for discrete conformational states of amino acid +//! side chains, which are crucial for protein structure modeling. Rotamers represent the +//! most probable orientations of side chains around their rotatable bonds, based on +//! statistical analysis of known protein structures. +//! +//! ## Key Components +//! +//! - [`library`] - Rotamer library management and loading from external files +//! - [`rotamer`] - Core data structures for representing individual rotamers +//! +//! ## Scientific Background +//! +//! Protein side chains can adopt multiple discrete conformations (rotamers) due to +//! rotations around single bonds. The most common rotamers are pre-computed from +//! structural databases and used in: +//! +//! - **Protein structure prediction** - Selecting optimal side-chain conformations +//! - **Protein design** - Exploring sequence-conformation relationships +//! - **Molecular docking** - Avoiding steric clashes during ligand binding +//! - **Structure refinement** - Optimizing side-chain packing in crystal structures +//! +//! ## Usage +//! +//! The rotamers module is typically used through the [`library::RotamerLibrary`] which +//! provides access to pre-computed rotamer conformations for different amino acid types. +//! +//! ```ignore +//! use screampp::core::rotamers::library::RotamerLibrary; +//! +//! let library = RotamerLibrary::load(std::path::Path::new("rotamers.toml"), &topology, &forcefield, 1.0)?; +//! let ala_rotamers = library.get_rotamers_for(ResidueType::Alanine); +//!
``` + pub mod library; pub mod rotamer; From 61cb420048c36319662358e3de007d4be41d57d5 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 17:09:39 -0700 Subject: [PATCH 39/83] docs(core): Add documentation for Core Module, detailing overview, architecture, key capabilities, and scientific foundation --- crates/scream-core/src/core/mod.rs | 43 ++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/crates/scream-core/src/core/mod.rs b/crates/scream-core/src/core/mod.rs index 6ba3c174..10de6856 100644 --- a/crates/scream-core/src/core/mod.rs +++ b/crates/scream-core/src/core/mod.rs @@ -1,3 +1,46 @@ +//! # Core Module +//! +//! This module provides the fundamental building blocks and algorithms for protein +//! side-chain placement and molecular modeling in SCREAM++, serving as the computational +//! core of the library. +//! +//! ## Overview +//! +//! The core module implements the essential data structures, algorithms, and utilities +//! required for automated protein side-chain conformation prediction. It provides a +//! complete framework for representing molecular systems, computing interaction energies, +//! and managing conformational libraries. +//! +//! ## Architecture +//! +//! The module is organized into specialized submodules that handle different aspects +//! of molecular modeling: +//! +//! - **Molecular Representation** ([`models`]) - Data structures for atoms, residues, chains, and systems +//! - **Energy Calculations** ([`forcefield`]) - Force field parameters and energy computation +//! - **File I/O** ([`io`]) - Reading/writing molecular file formats with canonical ordering +//! - **Structural Knowledge** ([`topology`]) - Residue topology definitions and atom classification +//! - **Conformational Libraries** ([`rotamers`]) - Pre-computed side-chain rotamer collections +//! +//! ## Key Capabilities +//! +//! - **Complete molecular system representation** with efficient data structures +//! 
- **Molecular mechanics energy calculations** using classical force fields +//! - **Multi-format file I/O** with consistent atom/residue ordering +//! - **Rotamer library management** for side-chain conformation sampling +//! - **Topology-aware atom classification** for backbone/sidechain distinction +//! - **Extensible force field support** for different parameter sets +//! +//! ## Scientific Foundation +//! +//! The core module implements algorithms based on established computational chemistry +//! principles: +//! +//! - **Molecular mechanics** for energy minimization and conformational analysis +//! - **Rotamer libraries** derived from statistical analysis of protein structures +//! - **Force field methods** including Lennard-Jones, Coulomb, and hydrogen bonding potentials +//! - **Topology-based modeling** respecting molecular connectivity and stereochemistry + pub mod forcefield; pub mod io; pub mod models; From daa623b2a0b9396cc5132858192663fcc4bab79f Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 17:25:13 -0700 Subject: [PATCH 40/83] docs(engine): Enhance EngineError enum with additional error variants and detailed documentation --- crates/scream-core/src/engine/error.rs | 74 ++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/crates/scream-core/src/engine/error.rs b/crates/scream-core/src/engine/error.rs index 0cbe4e95..712a38bb 100644 --- a/crates/scream-core/src/engine/error.rs +++ b/crates/scream-core/src/engine/error.rs @@ -7,53 +7,127 @@ use crate::core::rotamers::library::LibraryLoadError; use crate::core::topology::registry::TopologyLoadError; use thiserror::Error; +/// Represents errors that can occur during SCREAM++ engine operations. +/// +/// This enum encompasses all possible failure modes that may arise during +/// protein side-chain placement, optimization, and related molecular modeling +/// operations. Each variant provides specific context about the type of error +/// and the operation that failed. 
#[derive(Debug, Error)] pub enum EngineError { + /// Indicates that the engine initialization process failed. + /// + /// This error occurs when the SCREAM++ engine cannot be properly set up + /// due to configuration issues, missing dependencies, or invalid parameters. + /// Common causes include malformed configuration files or incompatible settings. #[error("Initialization failed: {0}")] Initialization(String), + /// Indicates that a specified residue could not be found in the molecular system. + /// + /// This error occurs when attempting to operate on a residue that doesn't exist + /// in the current system, possibly due to incorrect residue identifiers or + /// system state inconsistencies. #[error("Residue not found in system: {spec:?}")] ResidueNotFound { spec: ResidueSpecifier }, + /// Indicates that an energy scoring operation failed. + /// + /// This error wraps failures from the energy calculation subsystem, which + /// may occur due to invalid atom coordinates, missing force field parameters, + /// or numerical instabilities during energy evaluation. #[error("Energy scoring failed: {source}")] Scoring { + /// The underlying scoring error that occurred. #[from] source: ScoringError, }, + /// Indicates that topology information is missing for a residue type. + /// + /// This error occurs when attempting to process a residue for which no + /// topology definition exists in the registry, preventing proper atom + /// classification and connectivity analysis. #[error("Topology not found for residue: {residue_name}")] TopologyNotFound { residue_name: String }, + /// Indicates an error related to rotamer library operations for a specific residue type. + /// + /// This error occurs when issues arise with rotamer data access or processing, + /// such as missing rotamer definitions, corrupted library files, or + /// incompatible rotamer formats. 
#[error("Rotamer library error for residue {residue_type:?}: {message}")] RotamerLibrary { + /// The residue type that encountered the library error. residue_type: String, + /// Detailed description of the library error. message: String, }, + /// Indicates that rotamer placement failed on a specific residue. + /// + /// This error occurs when the side-chain placement algorithm cannot successfully + /// position a rotamer on the target residue, typically due to steric clashes, + /// geometric constraints, or optimization failures. #[error("Failed to place rotamer on residue {residue_id:?}: {message}")] Placement { + /// The ID of the residue where placement failed. residue_id: ResidueId, + /// Detailed description of the placement failure. message: String, }, + /// Indicates that a specific optimization phase failed during execution. + /// + /// This error occurs when one of the algorithmic phases in the side-chain + /// placement pipeline encounters an unrecoverable error, such as numerical + /// instabilities or constraint violations. #[error("Optimization phase '{phase}' failed: {reason}")] PhaseFailed { phase: &'static str, reason: String }, + /// Indicates that the optimization algorithm failed to converge within the allowed iterations. + /// + /// This error occurs when iterative optimization methods cannot reach the + /// convergence criteria within the specified maximum number of iterations, + /// suggesting the problem may be ill-conditioned or require different parameters. #[error("Algorithm failed to converge after {iterations} iterations")] Convergence { iterations: usize }, + /// Indicates an internal logic error in the engine. + /// + /// This error represents unexpected conditions or programming errors that + /// should not occur under normal operation, such as inconsistent internal state + /// or violated invariants. #[error("Internal logic error: {0}")] Internal(String), + /// Indicates that force field parameter loading failed. 
+ /// + /// This error wraps failures from the parameter loading subsystem, which + /// may occur due to missing parameter files, malformed data, or incompatible + /// parameter formats. #[error("Failed to load forcefield parameters: {0}")] ParamLoad(#[from] ParamLoadError), + /// Indicates that rotamer library loading failed. + /// + /// This error wraps failures from the rotamer library loading process, + /// including file I/O errors, parsing failures, and data validation issues. #[error("Failed to load rotamer library: {0}")] LibraryLoad(#[from] LibraryLoadError), + /// Indicates that topology registry loading failed. + /// + /// This error wraps failures from the topology loading subsystem, which + /// may occur due to missing topology files or malformed topology definitions. #[error("Failed to load topology registry: {0}")] TopologyLoad(#[from] TopologyLoadError), + /// Indicates that system parameterization failed. + /// + /// This error wraps failures from the parameterization process, which assigns + /// force field parameters to atoms and may fail due to missing parameters + /// or incompatible atom types. 
#[error("Failed to parameterize system: {0}")] Parameterization(#[from] ParameterizationError), } From 84b9f28708ad264fd495f75b8a4c8f30f5966f55 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 17:39:58 -0700 Subject: [PATCH 41/83] docs(engine): Expand configuration structs and enums with detailed documentation and builder patterns for enhanced usability --- crates/scream-core/src/engine/config.rs | 539 +++++++++++++++++++++++- 1 file changed, 535 insertions(+), 4 deletions(-) diff --git a/crates/scream-core/src/engine/config.rs b/crates/scream-core/src/engine/config.rs index 2854163c..e11a12d6 100644 --- a/crates/scream-core/src/engine/config.rs +++ b/crates/scream-core/src/engine/config.rs @@ -4,80 +4,214 @@ use std::collections::HashMap; use std::path::PathBuf; use thiserror::Error; +/// Errors that can occur during configuration building and validation. +/// +/// This enum defines the possible errors that may arise when constructing +/// configuration objects for placement, design, or analysis operations in SCREAM++. +/// Each variant provides specific information about what went wrong to aid in +/// debugging configuration issues. #[derive(Debug, Error, PartialEq, Eq, Clone)] pub enum ConfigError { + /// A required configuration parameter was not provided. + /// + /// This error is returned when attempting to build a configuration without + /// supplying all mandatory parameters. The parameter name is included in + /// the error message to help identify what needs to be specified. #[error("Missing required parameter: {0}")] MissingParameter(&'static str), } +/// Uniquely identifies a residue within a protein structure. +/// +/// This struct serves as a key to reference specific amino acid residues in +/// molecular structures. It combines the chain identifier with the residue +/// sequence number to provide unambiguous targeting for various computational +/// operations such as side-chain placement, design, or analysis. 
#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct ResidueSpecifier { + /// The character identifier of the protein chain containing this residue. pub chain_id: char, + /// The sequential position number of the residue within its chain. pub residue_number: isize, } +/// Defines how to select residues for computational operations. +/// +/// This enum provides flexible ways to specify which residues should be +/// included or excluded from operations like side-chain optimization or +/// protein design. It supports selecting all residues, specific lists, +/// or regions around a ligand binding site. #[derive(Debug, Clone, PartialEq)] pub enum ResidueSelection { + /// Select all residues in the structure. All, + /// Select specific residues with optional exclusions. + /// + /// # Arguments + /// + /// * `include` - List of residues to include in the selection. + /// * `exclude` - List of residues to exclude from the selection. List { include: Vec, exclude: Vec, }, + /// Select residues within a specified radius of a ligand. + /// + /// # Arguments + /// + /// * `ligand_residue` - The residue identifier of the ligand. + /// * `radius_angstroms` - The radius in Angstroms around the ligand. LigandBindingSite { ligand_residue: ResidueSpecifier, radius_angstroms: f64, }, } +/// Configuration parameters for optimization convergence criteria. +/// +/// This struct defines the thresholds and patience settings used to determine +/// when an optimization algorithm has converged to a stable solution. It helps +/// balance computational efficiency with solution quality in iterative +/// refinement processes. #[derive(Debug, Clone, PartialEq)] pub struct ConvergenceConfig { + /// The minimum energy difference threshold for convergence. + /// + /// Optimization stops when the energy change between iterations + /// falls below this value (in kcal/mol). pub energy_threshold: f64, + /// Number of iterations to wait before checking convergence. 
+ /// + /// This prevents premature termination due to temporary energy fluctuations + /// during the early stages of optimization. pub patience_iterations: usize, } +/// Configuration for simulated annealing optimization. +/// +/// This struct contains the parameters that control the simulated annealing +/// algorithm used for global optimization of side-chain conformations. The +/// algorithm uses a temperature schedule to explore the conformational space +/// and gradually converge to optimal solutions. #[derive(Debug, Clone, PartialEq)] pub struct SimulatedAnnealingConfig { + /// The starting temperature for the annealing process. + /// + /// Higher values allow more exploration of the conformational space + /// at the beginning of optimization. pub initial_temperature: f64, + /// The final temperature at which annealing terminates. + /// + /// Lower values focus the search on local optima near the end. pub final_temperature: f64, + /// The factor by which temperature decreases each step. + /// + /// Values between 0.8 and 0.99 are typical, with lower values + /// providing faster cooling. pub cooling_rate: f64, + /// Number of optimization steps performed at each temperature level. pub steps_per_temperature: usize, } +/// Configuration for force field parameters and energy calculations. +/// +/// This struct specifies the force field files and parameters used for +/// molecular mechanics energy calculations. It includes the main force field +/// parameters, delta parameters for side-chain corrections, and energy +/// weighting factors. #[derive(Debug, Clone, PartialEq)] pub struct ForcefieldConfig { + /// Path to the main force field parameter file. pub forcefield_path: PathBuf, + /// Path to the delta parameter file for side-chain corrections. pub delta_params_path: PathBuf, + /// Scaling factor for non-bonded interactions. + /// + /// This parameter adjusts the strength of van der Waals and electrostatic + /// interactions in the force field. 
pub s_factor: f64, + /// Relative weights for different energy components. pub energy_weights: EnergyWeights, } +/// Configuration for rotamer sampling during optimization. +/// +/// This struct specifies the rotamer library used to sample side-chain +/// conformations during placement and design operations. Rotamer libraries +/// contain pre-computed, energetically favorable side-chain orientations. #[derive(Debug, Clone, PartialEq)] pub struct SamplingConfig { + /// Path to the rotamer library file. pub rotamer_library_path: PathBuf, } +/// Configuration for the overall optimization process. +/// +/// This struct defines the parameters that control the optimization algorithm, +/// including iteration limits, solution generation, convergence criteria, +/// and optional simulated annealing settings. #[derive(Debug, Clone, PartialEq)] pub struct OptimizationConfig { + /// Maximum number of optimization iterations. pub max_iterations: usize, + /// Number of solution candidates to generate. pub num_solutions: usize, + /// Whether to include the input conformation as a candidate. pub include_input_conformation: bool, + /// Convergence criteria for optimization termination. pub convergence: ConvergenceConfig, + /// Optional simulated annealing configuration. + /// + /// If None, standard optimization without annealing is used. pub simulated_annealing: Option, + /// Number of final refinement iterations after convergence. pub final_refinement_iterations: usize, } +/// Complete configuration for side-chain placement operations. +/// +/// This struct encapsulates all parameters needed to perform automated +/// side-chain placement on a protein structure. It includes force field +/// settings, sampling parameters, optimization controls, and residue +/// selection criteria. #[derive(Debug, Clone, PartialEq)] pub struct PlacementConfig { + /// Force field configuration for energy calculations. pub forcefield: ForcefieldConfig, + /// Rotamer sampling configuration. 
pub sampling: SamplingConfig, + /// Optimization algorithm parameters. pub optimization: OptimizationConfig, + /// Selection of residues to optimize. pub residues_to_optimize: ResidueSelection, + /// Path to the topology registry file. pub topology_registry_path: PathBuf, } +/// Type alias for specifying amino acid mutations in protein design. +/// +/// This type maps residue specifiers to lists of allowed amino acid types, +/// defining the design space for computational protein design operations. +/// Each entry specifies which residue positions can mutate to which amino acids. pub type DesignSpec = HashMap>; +/// Extension trait for DesignSpec providing convenient access methods. +/// +/// This trait adds utility methods to the DesignSpec type alias to simplify +/// common operations when working with design specifications in protein +/// design workflows. pub trait DesignSpecExt { + /// Retrieves the allowed amino acid types for a specific residue. + /// + /// # Arguments + /// + /// * `chain_id` - The chain identifier of the residue. + /// * `residue_number` - The sequence number of the residue. + /// + /// # Return + /// + /// Returns `Some(Vec)` if the residue is in the design spec, + /// or `None` if it is not specified. fn get_by_specifier(&self, chain_id: char, residue_number: isize) -> Option<&Vec>; } @@ -90,41 +224,87 @@ impl DesignSpecExt for DesignSpec { } } +/// Complete configuration for computational protein design operations. +/// +/// This struct defines all parameters required for protein design, including +/// force field settings, rotamer sampling, optimization parameters, the design +/// specification, and neighbor repacking criteria. #[derive(Debug, Clone, PartialEq)] pub struct DesignConfig { + /// Force field configuration for energy calculations. pub forcefield: ForcefieldConfig, + /// Rotamer sampling configuration. pub sampling: SamplingConfig, + /// Optimization algorithm parameters. 
pub optimization: OptimizationConfig, + /// Specification of allowed mutations at each position. pub design_spec: DesignSpec, + /// Selection of neighboring residues to repack during design. pub neighbors_to_repack: ResidueSelection, + /// Path to the topology registry file. pub topology_registry_path: PathBuf, } +/// Defines how to select atoms for analysis operations. +/// +/// This enum provides ways to specify which atoms should be included in +/// molecular analysis calculations, such as interaction energy computations +/// or clash detection. #[derive(Debug, Clone, PartialEq)] pub enum AtomSelection { + /// Select a single residue's atoms. Residue(ResidueSpecifier), + /// Select all atoms in a specific chain. Chain(char), + /// Select all atoms in the structure. All, } +/// Specifies the type of molecular analysis to perform. +/// +/// This enum defines the available analysis operations that can be performed +/// on molecular structures, including interaction energy calculations and +/// steric clash detection. #[derive(Debug, Clone, PartialEq)] pub enum AnalysisType { + /// Calculate interaction energies between two atom groups. + /// + /// # Arguments + /// + /// * `group1` - First group of atoms for interaction calculation. + /// * `group2` - Second group of atoms for interaction calculation. Interaction { group1: AtomSelection, group2: AtomSelection, }, - ClashDetection { - threshold_kcal_mol: f64, - }, + /// Detect steric clashes above a threshold. + /// + /// # Arguments + /// + /// * `threshold_kcal_mol` - Energy threshold for clash detection in kcal/mol. + ClashDetection { threshold_kcal_mol: f64 }, } +/// Complete configuration for molecular analysis operations. +/// +/// This struct contains all parameters needed to perform analysis on +/// molecular structures, including force field settings and the specific +/// type of analysis to conduct. 
#[derive(Debug, Clone, PartialEq)] pub struct AnalyzeConfig { + /// Force field configuration for energy calculations. pub forcefield: ForcefieldConfig, + /// The type of analysis to perform. pub analysis_type: AnalysisType, + /// Path to the topology registry file. pub topology_registry_path: PathBuf, } +/// Builder pattern implementation for constructing PlacementConfig. +/// +/// This struct provides a fluent interface for building placement configurations +/// with validation. It ensures all required parameters are provided and +/// uses sensible defaults for optional fields. #[derive(Default)] pub struct PlacementConfigBuilder { forcefield_path: Option, @@ -143,63 +323,194 @@ pub struct PlacementConfigBuilder { } impl PlacementConfigBuilder { + /// Creates a new builder with default values. + /// + /// # Return + /// + /// Returns a new `PlacementConfigBuilder` instance with all fields unset. pub fn new() -> Self { Self::default() } - + /// Sets the force field parameter file path. + /// + /// # Arguments + /// + /// * `path` - Path to the force field parameter file. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn forcefield_path(mut self, path: impl Into) -> Self { self.forcefield_path = Some(path.into()); self } + /// Sets the delta parameters file path. + /// + /// # Arguments + /// + /// * `path` - Path to the delta parameters file. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn delta_params_path(mut self, path: impl Into) -> Self { self.delta_params_path = Some(path.into()); self } + /// Sets the scaling factor for non-bonded interactions. + /// + /// # Arguments + /// + /// * `factor` - The scaling factor value. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn s_factor(mut self, factor: f64) -> Self { self.s_factor = Some(factor); self } + /// Sets the energy weights for different force field components. 
+ /// + /// # Arguments + /// + /// * `weights` - The energy weights configuration. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn energy_weights(mut self, weights: EnergyWeights) -> Self { self.energy_weights = Some(weights); self } + /// Sets the rotamer library file path. + /// + /// # Arguments + /// + /// * `path` - Path to the rotamer library file. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn rotamer_library_path(mut self, path: impl Into) -> Self { self.rotamer_library_path = Some(path.into()); self } + /// Sets the topology registry file path. + /// + /// # Arguments + /// + /// * `path` - Path to the topology registry file. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn topology_registry_path(mut self, path: impl Into) -> Self { self.topology_registry_path = Some(path.into()); self } + /// Sets the maximum number of optimization iterations. + /// + /// # Arguments + /// + /// * `iterations` - The maximum number of iterations. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn max_iterations(mut self, iterations: usize) -> Self { self.max_iterations = Some(iterations); self } + /// Sets the number of solution candidates to generate. + /// + /// # Arguments + /// + /// * `n` - The number of solutions. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn num_solutions(mut self, n: usize) -> Self { self.num_solutions = Some(n); self } + /// Sets whether to include the input conformation as a candidate. + /// + /// # Arguments + /// + /// * `include` - Whether to include the input conformation. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn include_input_conformation(mut self, include: bool) -> Self { self.include_input_conformation = Some(include); self } + /// Sets the convergence configuration. 
+ /// + /// # Arguments + /// + /// * `config` - The convergence configuration. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn convergence_config(mut self, config: ConvergenceConfig) -> Self { self.convergence_config = Some(config); self } + /// Sets the simulated annealing configuration. + /// + /// # Arguments + /// + /// * `config` - The simulated annealing configuration, or None to disable. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn simulated_annealing_config(mut self, config: Option) -> Self { self.simulated_annealing_config = config; self } + /// Sets the number of final refinement iterations. + /// + /// # Arguments + /// + /// * `iterations` - The number of refinement iterations. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn final_refinement_iterations(mut self, iterations: usize) -> Self { self.final_refinement_iterations = Some(iterations); self } + /// Sets the residue selection for optimization. + /// + /// # Arguments + /// + /// * `selection` - The residue selection criteria. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn residues_to_optimize(mut self, selection: ResidueSelection) -> Self { self.residues_to_optimize = Some(selection); self } + /// Builds the PlacementConfig from the current builder state. + /// + /// # Return + /// + /// Returns `Ok(PlacementConfig)` if all required parameters are set, + /// or `Err(ConfigError)` if any required parameter is missing. + /// + /// # Errors + /// + /// Returns `ConfigError::MissingParameter` if any required field is not set. pub fn build(self) -> Result { let forcefield = ForcefieldConfig { forcefield_path: self @@ -249,6 +560,11 @@ impl PlacementConfigBuilder { } } +/// Builder pattern implementation for constructing DesignConfig. +/// +/// This struct provides a fluent interface for building design configurations +/// with validation. 
It ensures all required parameters are provided and +/// uses sensible defaults for optional fields. #[derive(Default)] pub struct DesignConfigBuilder { forcefield_path: Option, @@ -268,70 +584,211 @@ pub struct DesignConfigBuilder { } impl DesignConfigBuilder { + /// Creates a new builder with default values. + /// + /// # Return + /// + /// Returns a new `DesignConfigBuilder` instance with all fields unset. pub fn new() -> Self { Self::default() } + /// Sets the force field parameter file path. + /// + /// # Arguments + /// + /// * `path` - Path to the force field parameter file. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn forcefield_path(mut self, path: impl Into) -> Self { self.forcefield_path = Some(path.into()); self } + /// Sets the delta parameters file path. + /// + /// # Arguments + /// + /// * `path` - Path to the delta parameters file. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn delta_params_path(mut self, path: impl Into) -> Self { self.delta_params_path = Some(path.into()); self } + /// Sets the scaling factor for non-bonded interactions. + /// + /// # Arguments + /// + /// * `factor` - The scaling factor value. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn s_factor(mut self, factor: f64) -> Self { self.s_factor = Some(factor); self } + /// Sets the energy weights for different force field components. + /// + /// # Arguments + /// + /// * `weights` - The energy weights configuration. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn energy_weights(mut self, weights: EnergyWeights) -> Self { self.energy_weights = Some(weights); self } + /// Sets the rotamer library file path. + /// + /// # Arguments + /// + /// * `path` - Path to the rotamer library file. + /// + /// # Return + /// + /// Returns the builder for method chaining. 
pub fn rotamer_library_path(mut self, path: impl Into) -> Self { self.rotamer_library_path = Some(path.into()); self } + /// Sets the topology registry file path. + /// + /// # Arguments + /// + /// * `path` - Path to the topology registry file. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn topology_registry_path(mut self, path: impl Into) -> Self { self.topology_registry_path = Some(path.into()); self } + /// Sets the maximum number of optimization iterations. + /// + /// # Arguments + /// + /// * `iterations` - The maximum number of iterations. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn max_iterations(mut self, iterations: usize) -> Self { self.max_iterations = Some(iterations); self } + /// Sets the number of solution candidates to generate. + /// + /// # Arguments + /// + /// * `n` - The number of solutions. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn num_solutions(mut self, n: usize) -> Self { self.num_solutions = Some(n); self } + /// Sets whether to include the input conformation as a candidate. + /// + /// # Arguments + /// + /// * `include` - Whether to include the input conformation. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn include_input_conformation(mut self, include: bool) -> Self { self.include_input_conformation = Some(include); self } + /// Sets the convergence configuration. + /// + /// # Arguments + /// + /// * `config` - The convergence configuration. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn convergence_config(mut self, config: ConvergenceConfig) -> Self { self.convergence_config = Some(config); self } + /// Sets the simulated annealing configuration. + /// + /// # Arguments + /// + /// * `config` - The simulated annealing configuration. + /// + /// # Return + /// + /// Returns the builder for method chaining. 
pub fn simulated_annealing_config(mut self, config: SimulatedAnnealingConfig) -> Self { self.simulated_annealing_config = Some(config); self } + /// Sets the number of final refinement iterations. + /// + /// # Arguments + /// + /// * `iterations` - The number of refinement iterations. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn final_refinement_iterations(mut self, iterations: usize) -> Self { self.final_refinement_iterations = Some(iterations); self } + /// Sets the design specification. + /// + /// # Arguments + /// + /// * `spec` - The design specification mapping residues to allowed types. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn design_spec(mut self, spec: DesignSpec) -> Self { self.design_spec = Some(spec); self } + /// Sets the residue selection for neighbors to repack. + /// + /// # Arguments + /// + /// * `selection` - The residue selection criteria for neighbors. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn neighbors_to_repack(mut self, selection: ResidueSelection) -> Self { self.neighbors_to_repack = Some(selection); self } + /// Builds the DesignConfig from the current builder state. + /// + /// # Return + /// + /// Returns `Ok(DesignConfig)` if all required parameters are set, + /// or `Err(ConfigError)` if any required parameter is missing. + /// + /// # Errors + /// + /// Returns `ConfigError::MissingParameter` if any required field is not set. pub fn build(self) -> Result { let forcefield = ForcefieldConfig { forcefield_path: self @@ -386,6 +843,11 @@ impl DesignConfigBuilder { } } +/// Builder pattern implementation for constructing AnalyzeConfig. +/// +/// This struct provides a fluent interface for building analysis configurations +/// with validation. It ensures all required parameters are provided and +/// uses sensible defaults for optional fields. 
#[derive(Default)] pub struct AnalyzeConfigBuilder { forcefield_path: Option, @@ -397,35 +859,104 @@ pub struct AnalyzeConfigBuilder { } impl AnalyzeConfigBuilder { + /// Creates a new builder with default values. + /// + /// # Return + /// + /// Returns a new `AnalyzeConfigBuilder` instance with all fields unset. pub fn new() -> Self { Self::default() } + /// Sets the force field parameter file path. + /// + /// # Arguments + /// + /// * `path` - Path to the force field parameter file. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn forcefield_path(mut self, path: impl Into) -> Self { self.forcefield_path = Some(path.into()); self } + /// Sets the delta parameters file path. + /// + /// # Arguments + /// + /// * `path` - Path to the delta parameters file. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn delta_params_path(mut self, path: impl Into) -> Self { self.delta_params_path = Some(path.into()); self } + /// Sets the scaling factor for non-bonded interactions. + /// + /// # Arguments + /// + /// * `factor` - The scaling factor value. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn s_factor(mut self, factor: f64) -> Self { self.s_factor = Some(factor); self } + /// Sets the energy weights for different force field components. + /// + /// # Arguments + /// + /// * `weights` - The energy weights configuration. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn energy_weights(mut self, weights: EnergyWeights) -> Self { self.energy_weights = Some(weights); self } + /// Sets the topology registry file path. + /// + /// # Arguments + /// + /// * `path` - Path to the topology registry file. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn topology_registry_path(mut self, path: impl Into) -> Self { self.topology_registry_path = Some(path.into()); self } + /// Sets the type of analysis to perform. 
+ /// + /// # Arguments + /// + /// * `analysis_type` - The analysis type configuration. + /// + /// # Return + /// + /// Returns the builder for method chaining. pub fn analysis_type(mut self, analysis_type: AnalysisType) -> Self { self.analysis_type = Some(analysis_type); self } + /// Builds the AnalyzeConfig from the current builder state. + /// + /// # Return + /// + /// Returns `Ok(AnalyzeConfig)` if all required parameters are set, + /// or `Err(ConfigError)` if any required parameter is missing. + /// + /// # Errors + /// + /// Returns `ConfigError::MissingParameter` if any required field is not set. pub fn build(self) -> Result { let forcefield = ForcefieldConfig { forcefield_path: self From bbb2ad644e48fa4ed8cfe7dd35db25de66e884e1 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 17:41:48 -0700 Subject: [PATCH 42/83] docs(engine): Add detailed documentation for OptimizationContext and AnalysisContext --- crates/scream-core/src/engine/context.rs | 108 +++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/crates/scream-core/src/engine/context.rs b/crates/scream-core/src/engine/context.rs index ddd8a459..76dfd399 100644 --- a/crates/scream-core/src/engine/context.rs +++ b/crates/scream-core/src/engine/context.rs @@ -10,16 +10,28 @@ use crate::core::{ }; use std::collections::HashSet; +/// Provides a unified context for optimization operations in SCREAM++. +/// +/// This struct encapsulates all the data and configuration needed to perform +/// side-chain placement and protein design operations. It provides convenient +/// access to the molecular system, force field, rotamer library, and other +/// essential components while ensuring thread safety through the Sync bound. #[derive(Clone, Copy)] pub struct OptimizationContext<'a, C> where C: ProvidesResidueSelections + Sync, { + /// Reference to the molecular system being optimized. pub system: &'a MolecularSystem, + /// Reference to the force field parameters for energy calculations. 
pub forcefield: &'a Forcefield, + /// Progress reporter for tracking optimization progress. pub reporter: &'a ProgressReporter<'a>, + /// Configuration object providing residue selection criteria. pub config: &'a C, + /// Reference to the rotamer library for sampling conformations. pub rotamer_library: &'a RotamerLibrary, + /// Reference to the topology registry for molecular structure definitions. pub topology_registry: &'a TopologyRegistry, } @@ -27,6 +39,20 @@ impl<'a, C> OptimizationContext<'a, C> where C: ProvidesResidueSelections + Sync, { + /// Creates a new optimization context with the provided components. + /// + /// # Arguments + /// + /// * `system` - The molecular system to optimize. + /// * `forcefield` - The force field for energy calculations. + /// * `reporter` - Progress reporter for tracking operations. + /// * `config` - Configuration providing residue selections. + /// * `rotamer_library` - Library of rotamer conformations. + /// * `topology_registry` - Registry of molecular topologies. + /// + /// # Return + /// + /// Returns a new `OptimizationContext` instance. pub fn new( system: &'a MolecularSystem, forcefield: &'a Forcefield, @@ -45,14 +71,43 @@ where } } + /// Resolves the repack residue selection to a set of residue IDs. + /// + /// This method converts the residue selection criteria from the configuration + /// into actual residue IDs present in the molecular system, filtering for + /// residues that have available rotamers. + /// + /// # Return + /// + /// Returns a `HashSet` of `ResidueId`s representing residues to repack. + /// + /// # Errors + /// + /// Returns `EngineError` if the selection cannot be resolved or if there + /// are issues with the molecular system structure. pub fn resolve_repack_residues(&self) -> Result, EngineError> { let selection = self.config.repack_selection(); query::resolve_selection_to_ids(self.system, selection, self.rotamer_library) } + /// Resolves the design residue selection to a set of residue IDs. 
+ /// + /// For design operations, this method extracts residue IDs from the design + /// specification. If no design spec is provided, returns an empty set. + /// + /// # Return + /// + /// Returns a `HashSet` of `ResidueId`s representing residues to design. + /// + /// # Errors + /// + /// Returns `EngineError` if the selection cannot be resolved or if there + /// are issues with the molecular system structure. pub fn resolve_design_residues(&self) -> Result, EngineError> { match self.config.design_spec() { Some(spec) => { + // Extract residue specifiers from the design specification keys + // to create a selection for residues that will be mutated let specifiers: Vec<_> = spec.keys().cloned().collect(); let selection = ResidueSelection::List { include: specifiers, @@ -64,6 +119,18 @@ where } } + /// Resolves all active residues for the optimization operation. + /// + /// This combines both repack and design residues into a single set, + /// representing all residues that will be actively optimized. + /// + /// # Return + /// + /// Returns a `HashSet` of `ResidueId`s representing all active residues. + /// + /// # Errors + /// + /// Returns `EngineError` if any residue selection cannot be resolved. pub fn resolve_all_active_residues(&self) -> Result, EngineError> { let repack_ids = self.resolve_repack_residues()?; let design_ids = self.resolve_design_residues()?; @@ -71,16 +138,40 @@ where } } +/// Provides a unified context for analysis operations in SCREAM++. +/// +/// This struct encapsulates the necessary components for performing molecular +/// analysis operations such as interaction energy calculations and steric +/// clash detection. It provides access to the molecular system, force field, +/// and analysis configuration. #[derive(Clone, Copy)] pub struct AnalysisContext<'a> { + /// Reference to the molecular system being analyzed. pub system: &'a MolecularSystem, + /// Reference to the force field parameters for energy calculations. 
pub forcefield: &'a Forcefield, + /// Progress reporter for tracking analysis progress. pub reporter: &'a ProgressReporter<'a>, + /// Configuration object specifying the analysis type and parameters. pub config: &'a AnalyzeConfig, + /// Reference to the topology registry for molecular structure definitions. pub topology_registry: &'a TopologyRegistry, } impl<'a> AnalysisContext<'a> { + /// Creates a new analysis context with the provided components. + /// + /// # Arguments + /// + /// * `system` - The molecular system to analyze. + /// * `forcefield` - The force field for energy calculations. + /// * `reporter` - Progress reporter for tracking operations. + /// * `config` - Configuration specifying the analysis type. + /// * `topology_registry` - Registry of molecular topologies. + /// + /// # Return + /// + /// Returns a new `AnalysisContext` instance. pub fn new( system: &'a MolecularSystem, forcefield: &'a Forcefield, @@ -98,8 +189,25 @@ impl<'a> AnalysisContext<'a> { } } +/// Defines the interface for configuration objects that provide residue selections. +/// +/// This trait is implemented by configuration structs that need to specify +/// which residues should be included in optimization operations. It provides +/// methods to access repack selections and optional design specifications. pub trait ProvidesResidueSelections { + /// Returns the residue selection for repacking operations. + /// + /// # Return + /// + /// Returns a reference to the `ResidueSelection` specifying which residues + /// should be repacked during optimization. fn repack_selection(&self) -> &ResidueSelection; + /// Returns the design specification if available. + /// + /// # Return + /// + /// Returns `Some(&DesignSpec)` if the configuration includes design operations, + /// or `None` if only repacking is performed. 
fn design_spec(&self) -> Option<&DesignSpec> { None } From 7dda4ff1307847ce78ed046ac263d5f0097d443a Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 17:46:56 -0700 Subject: [PATCH 43/83] docs(engine): Add documentation for InitialState, SolutionState, and Solution structs --- crates/scream-core/src/engine/state.rs | 71 ++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/crates/scream-core/src/engine/state.rs b/crates/scream-core/src/engine/state.rs index 427293d8..5ba4cf75 100644 --- a/crates/scream-core/src/engine/state.rs +++ b/crates/scream-core/src/engine/state.rs @@ -2,23 +2,46 @@ use crate::core::models::ids::ResidueId; use crate::core::models::system::MolecularSystem; use std::collections::{BinaryHeap, HashMap}; +/// Represents the initial state of a molecular system before optimization. +/// +/// This struct captures the starting point of an optimization process, including +/// the molecular system configuration and its associated energy values. It serves +/// as a baseline for tracking optimization progress and comparing final results. #[derive(Debug, Clone)] pub struct InitialState { + /// The molecular system in its initial configuration. pub system: MolecularSystem, + /// The total energy of the initial system configuration. pub total_energy: f64, + /// The optimization score used to evaluate the initial state. pub optimization_score: f64, } +/// Represents a specific state of the molecular system with assigned rotamers. +/// +/// This struct stores a snapshot of the molecular system along with the rotamer +/// assignments for each residue. It captures both the structural configuration +/// and the conformational choices made during optimization. #[derive(Debug, Clone)] pub struct SolutionState { + /// The molecular system with its current atomic coordinates and structure. pub system: MolecularSystem, + /// Mapping of residue IDs to their assigned rotamer indices. 
pub rotamers: HashMap, } +/// Represents a complete solution from the optimization process. +/// +/// This struct encapsulates a solution found during optimization, including its +/// energy values, optimization score, and the complete system state with rotamer +/// assignments. Solutions are comparable based on their optimization scores. #[derive(Debug, Clone)] pub struct Solution { + /// The total energy of this solution configuration. pub total_energy: f64, + /// The optimization score used for ranking and comparison of solutions. pub optimization_score: f64, + /// The complete state of the molecular system and rotamer assignments. pub state: SolutionState, } @@ -37,26 +60,51 @@ impl PartialOrd for Solution { } impl Ord for Solution { + // Solutions are ordered by optimization score, with lower scores being better + // This makes the BinaryHeap act as a max-heap where the "largest" (worst) solution is at the top fn cmp(&self, other: &Self) -> std::cmp::Ordering { self.partial_cmp(other).unwrap_or(std::cmp::Ordering::Equal) } } +/// Manages the state of an ongoing optimization process. +/// +/// This struct tracks the current working state of the molecular system during +/// optimization, maintains a collection of the best solutions found so far, and +/// provides methods for submitting new solutions and retrieving results. It uses +/// a binary heap to efficiently maintain the top N solutions by optimization score. #[derive(Debug, Clone)] pub(crate) struct OptimizationState { + /// The current working state of the molecular system being optimized. pub working_state: SolutionState, + /// The optimization score of the current working state. pub current_optimization_score: f64, + /// Binary heap storing the best solutions found, ordered by optimization score. solutions: BinaryHeap, + /// Maximum number of solutions to maintain in the collection. 
max_solutions: usize, } impl OptimizationState { + /// Creates a new optimization state with the initial system configuration. + /// + /// # Arguments + /// + /// * `initial_system` - The starting molecular system configuration. + /// * `initial_rotamers` - Initial rotamer assignments for residues. + /// * `initial_optimization_score` - Starting optimization score. + /// * `max_solutions` - Maximum number of solutions to track (minimum 1). + /// + /// # Return + /// + /// Returns a new `OptimizationState` instance initialized with the provided parameters. pub fn new( initial_system: MolecularSystem, initial_rotamers: HashMap, initial_optimization_score: f64, max_solutions: usize, ) -> Self { + // Ensure at least one solution is tracked to avoid empty collections let max_s = if max_solutions == 0 { 1 } else { max_solutions }; let initial_state = SolutionState { @@ -79,6 +127,12 @@ impl OptimizationState { } } + /// Submits the current working state as a potential solution. + /// + /// This method evaluates the current working state and adds it to the solution + /// collection if it improves upon the existing solutions. If the collection is + /// at capacity, it replaces the worst solution if the current state is better. + /// The optimization score determines solution quality, with lower scores being better. pub fn submit_current_solution(&mut self) { if self.solutions.len() < self.max_solutions { self.solutions.push(Solution { @@ -89,6 +143,8 @@ impl OptimizationState { return; } + // BinaryHeap is a max-heap, so peek() gives the worst solution (highest score) + // Replace it only if current solution has a better (lower) score if let Some(worst_of_the_best) = self.solutions.peek() { if self.current_optimization_score < worst_of_the_best.optimization_score { let mut worst_solution_placeholder = self.solutions.pop().unwrap(); @@ -99,10 +155,25 @@ impl OptimizationState { } } + /// Consumes the optimization state and returns all solutions in sorted order. 
+ /// + /// Solutions are sorted by optimization score in ascending order (best first). + /// This method takes ownership of the state, so it should be called when + /// optimization is complete. + /// + /// # Return + /// + /// Returns a `Vec` containing all tracked solutions, sorted by quality. pub fn into_sorted_solutions(self) -> Vec { self.solutions.into_sorted_vec() } + /// Returns the best optimization score among all tracked solutions. + /// + /// # Return + /// + /// Returns the minimum optimization score (best value) from all solutions, + /// or `f64::INFINITY` if no solutions are available. pub fn best_energy(&self) -> f64 { self.solutions .iter() From b14b02e771e999893627ebfc05d47f2db9c6800e Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 17:48:50 -0700 Subject: [PATCH 44/83] docs(engine): Add documentation for Progress enum and ProgressReporter struct --- crates/scream-core/src/engine/progress.rs | 68 +++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/crates/scream-core/src/engine/progress.rs b/crates/scream-core/src/engine/progress.rs index 614ade44..b4c1c1df 100644 --- a/crates/scream-core/src/engine/progress.rs +++ b/crates/scream-core/src/engine/progress.rs @@ -1,35 +1,103 @@ +/// Represents different types of progress events during SCREAM++ operations. +/// +/// This enum defines the various progress reporting events that can be emitted +/// during molecular optimization and analysis operations. It allows external +/// code to track the progress of long-running computations and provide user feedback. #[derive(Debug, Clone, PartialEq)] pub enum Progress { + /// Signals the start of a new computational phase. + /// + /// # Arguments + /// + /// * `name` - A static string identifier for the phase being started. PhaseStart { name: &'static str }, + /// Signals the completion of the current computational phase. PhaseFinish, + /// Signals the start of a task with a known total amount of work. 
+ /// + /// # Arguments + /// + /// * `total` - The total number of work units expected for this task. TaskStart { total: u64 }, + /// Signals incremental progress on the current task. + /// + /// # Arguments + /// + /// * `amount` - The number of work units completed in this increment. TaskIncrement { amount: u64 }, + /// Signals the completion of the current task. TaskFinish, + /// Provides a status update message for the current operation. + /// + /// # Arguments + /// + /// * `text` - A descriptive message about the current status. StatusUpdate { text: String }, + /// Provides a general informational message. + /// + /// The message text to display. Message(String), } +/// A callback function type for handling progress events. +/// +/// This type alias defines the signature for functions that can receive and +/// process progress events. The callback must be thread-safe (Send + Sync) and +/// can capture variables from its environment. pub type ProgressCallback<'a> = Box; +/// Provides a mechanism for reporting progress events during computations. +/// +/// This struct manages progress reporting for SCREAM++ operations. It can optionally +/// hold a callback function that will be invoked whenever progress events are reported. +/// If no callback is provided, progress events are silently ignored, allowing the same +/// code to work with or without progress monitoring. #[derive(Default)] pub struct ProgressReporter<'a> { + /// Optional callback function to handle progress events. callback: Option>, } impl<'a> ProgressReporter<'a> { + /// Creates a new progress reporter with no callback. + /// + /// This constructor creates a reporter that will silently ignore all + /// progress events. This is useful when progress monitoring is not needed + /// or when the callback will be set later. + /// + /// # Return + /// + /// Returns a new `ProgressReporter` instance with no callback configured. 
pub fn new() -> Self { Self::default() } + /// Creates a new progress reporter with the specified callback. + /// + /// # Arguments + /// + /// * `callback` - The callback function to invoke for progress events. + /// + /// # Return + /// + /// Returns a new `ProgressReporter` instance configured with the provided callback. pub fn with_callback(callback: ProgressCallback<'a>) -> Self { Self { callback: Some(callback), } } + /// Reports a progress event by invoking the configured callback. + /// + /// If no callback is configured, this method does nothing. The method is + /// marked as inline to minimize overhead when progress reporting is disabled. + /// + /// # Arguments + /// + /// * `event` - The progress event to report. #[inline] pub fn report(&self, event: Progress) { if let Some(cb) = &self.callback { From cf9c3b134c3823a8bd881760489348ae724c116b Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 17:53:25 -0700 Subject: [PATCH 45/83] docs(engine): Add documentation for ELCache struct methods and functionality --- crates/scream-core/src/engine/cache.rs | 78 ++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/crates/scream-core/src/engine/cache.rs b/crates/scream-core/src/engine/cache.rs index c530bb37..372c02f9 100644 --- a/crates/scream-core/src/engine/cache.rs +++ b/crates/scream-core/src/engine/cache.rs @@ -3,16 +3,39 @@ use crate::core::models::ids::ResidueId; use crate::core::models::residue::ResidueType; use std::collections::HashMap; +/// Caches energy terms for residue-rotamer combinations to avoid recomputation. +/// +/// This struct provides an efficient caching mechanism for storing and retrieving +/// energy calculations for different rotamer conformations of protein residues. +/// It uses a nested HashMap structure to organize energies by residue identity +/// and type, enabling fast lookups during optimization processes. 
#[derive(Debug, Default, Clone)] pub struct ELCache { + /// Internal storage mapping (residue_id, residue_type) to rotamer energies. data: HashMap<(ResidueId, ResidueType), HashMap>, } impl ELCache { + /// Creates a new empty energy cache. + /// + /// # Return + /// + /// Returns a new `ELCache` instance with no cached energies. pub fn new() -> Self { Self::default() } + /// Inserts an energy term for a specific residue-rotamer combination. + /// + /// If an energy term already exists for the given residue and rotamer, + /// it will be overwritten with the new value. + /// + /// # Arguments + /// + /// * `residue_id` - The unique identifier of the residue. + /// * `residue_type` - The type of the residue (e.g., Alanine, Glycine). + /// * `rotamer_idx` - The index of the rotamer conformation. + /// * `energy` - The energy term to cache for this combination. pub fn insert( &mut self, residue_id: ResidueId, @@ -26,6 +49,18 @@ impl ELCache { .insert(rotamer_idx, energy); } + /// Retrieves the cached energy term for a specific residue-rotamer combination. + /// + /// # Arguments + /// + /// * `residue_id` - The unique identifier of the residue. + /// * `residue_type` - The type of the residue. + /// * `rotamer_idx` - The index of the rotamer conformation. + /// + /// # Return + /// + /// Returns `Some(&EnergyTerm)` if the combination exists in the cache, + /// or `None` if no energy has been cached for this combination. pub fn get( &self, residue_id: ResidueId, @@ -37,6 +72,21 @@ impl ELCache { .and_then(|inner_map| inner_map.get(&rotamer_idx)) } + /// Retrieves all cached energy terms for a specific residue. + /// + /// This method returns the complete set of rotamer energies that have + /// been cached for the given residue, allowing iteration over all + /// available conformations. + /// + /// # Arguments + /// + /// * `residue_id` - The unique identifier of the residue. + /// * `residue_type` - The type of the residue. 
+ /// + /// # Return + /// + /// Returns `Some(&HashMap)` containing all cached + /// rotamer energies for the residue, or `None` if no energies are cached. pub fn get_energies_for( &self, residue_id: ResidueId, @@ -45,6 +95,22 @@ impl ELCache { self.data.get(&(residue_id, residue_type)) } + /// Finds the rotamer with the lowest total energy for a specific residue. + /// + /// This method searches through all cached rotamers for the given residue + /// and returns the one with the minimum total energy value. This is useful + /// for identifying the most favorable conformation. + /// + /// # Arguments + /// + /// * `residue_id` - The unique identifier of the residue. + /// * `residue_type` - The type of the residue. + /// + /// # Return + /// + /// Returns `Some((usize, &EnergyTerm))` containing the rotamer index and + /// energy of the ground state, or `None` if no energies are cached for + /// the residue. pub fn find_ground_state_for( &self, residue_id: ResidueId, @@ -52,6 +118,7 @@ impl ELCache { ) -> Option<(usize, &EnergyTerm)> { self.get_energies_for(residue_id, residue_type) .and_then(|energies| { + // Find the rotamer with the lowest total energy by comparing energy values energies .iter() .min_by(|(_, term_a), (_, term_b)| { @@ -64,10 +131,21 @@ impl ELCache { }) } + /// Returns the number of residues for which energies are cached. + /// + /// # Return + /// + /// Returns the count of unique (residue_id, residue_type) combinations + /// that have at least one cached energy term. pub fn len(&self) -> usize { self.data.len() } + /// Checks whether the cache contains any cached energies. + /// + /// # Return + /// + /// Returns `true` if no energies are cached, `false` otherwise. 
pub fn is_empty(&self) -> bool { self.data.is_empty() } From 1bd922fec75ae4ffecccf24f9329eb6bcc51bc08 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 17:55:56 -0700 Subject: [PATCH 46/83] docs(engine): Add documentation for EnergyGrid and MoveDelta structs, including detailed descriptions of methods and fields --- crates/scream-core/src/engine/energy_grid.rs | 94 ++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/crates/scream-core/src/engine/energy_grid.rs b/crates/scream-core/src/engine/energy_grid.rs index 5a06720c..f5c84772 100644 --- a/crates/scream-core/src/engine/energy_grid.rs +++ b/crates/scream-core/src/engine/energy_grid.rs @@ -15,26 +15,70 @@ use tracing::{info, trace}; #[cfg(feature = "parallel")] use rayon::prelude::*; +/// Manages the energy landscape for molecular optimization in SCREAM++. +/// +/// This struct provides a comprehensive energy calculation framework for protein +/// side-chain placement optimization. It maintains pairwise interaction energies +/// between residues, total interaction energies, and individual residue energies +/// to enable efficient delta energy calculations during optimization moves. +/// The energy grid supports both serial and parallel computation modes. #[derive(Debug, Clone)] pub struct EnergyGrid { + /// Pairwise interaction energies between residue pairs. pair_interactions: HashMap<(ResidueId, ResidueId), EnergyTerm>, + /// Total interaction energy for each residue with all other active residues. total_residue_interactions: HashMap, + /// Total interaction energy across all active residue pairs. total_interaction_energy: EnergyTerm, + /// Current electrostatic energies for each active residue. current_el_energies: HashMap, + /// Current total optimization score (interaction + electrostatic). current_optimization_score: f64, } +/// Represents the energy changes resulting from a single optimization move. 
+/// +/// This struct encapsulates all the energy deltas computed when changing a residue's +/// rotamer conformation. It provides the information needed to update the energy +/// grid efficiently without recomputing all interactions from scratch. #[derive(Debug, Clone)] pub struct MoveDelta { + /// The residue being moved. pub res_id: ResidueId, + /// The new rotamer index for the residue. pub new_rotamer_idx: usize, + /// The new electrostatic energy for the residue. pub new_el: EnergyTerm, + /// The new total interaction energy for the residue. pub new_total_interaction: EnergyTerm, + /// Updated pairwise interaction energies with other residues. pub new_pair_interactions: HashMap, + /// The total change in optimization score from this move. pub delta_score: f64, } impl EnergyGrid { + /// Creates a new energy grid with full energy calculations for the initial system state. + /// + /// This constructor performs a complete energy calculation for all active residues, + /// computing pairwise interactions and caching electrostatic energies. It serves + /// as the baseline for subsequent delta energy calculations during optimization. + /// + /// # Arguments + /// + /// * `system` - The molecular system containing the residues. + /// * `forcefield` - The force field parameters for energy calculations. + /// * `active_residues` - Set of residues that will be optimized. + /// * `el_cache` - Cache of pre-computed electrostatic energies. + /// * `initial_rotamers` - Initial rotamer assignments for active residues. + /// + /// # Return + /// + /// Returns a new `EnergyGrid` instance with computed energies. + /// + /// # Errors + /// + /// Returns `EngineError` if energy calculations fail. 
pub fn new( system: &MolecularSystem, forcefield: &Forcefield, @@ -83,6 +127,7 @@ impl EnergyGrid { *total_residue_interactions.get_mut(&res_b_id).unwrap() += interaction; } + // The total interaction energy is double-counted in the sum above, so divide by 2 let total_interaction_energy = total_residue_interactions .values() .fold(EnergyTerm::default(), |acc, term| acc + *term) @@ -121,14 +166,53 @@ impl EnergyGrid { }) } + /// Returns the current total optimization score. + /// + /// The optimization score is the sum of all interaction energies and + /// electrostatic energies for the current system configuration. + /// + /// # Return + /// + /// Returns the current total optimization score as a `f64`. pub fn total_score(&self) -> f64 { self.current_optimization_score } + /// Retrieves the current electrostatic energy for a specific residue. + /// + /// # Arguments + /// + /// * `res_id` - The residue ID to query. + /// + /// # Return + /// + /// Returns `Some(&EnergyTerm)` if the residue has cached electrostatic energy, + /// or `None` if the residue is not active or has no cached energy. pub fn get_el_energy(&self, res_id: ResidueId) -> Option<&EnergyTerm> { self.current_el_energies.get(&res_id) } + /// Calculates the energy change resulting from a proposed rotamer move. + /// + /// This method efficiently computes the delta energy without recalculating + /// all interactions. It uses a transaction-based approach to temporarily + /// apply the move and compute only the affected interactions. + /// + /// # Arguments + /// + /// * `res_id` - The residue to move. + /// * `new_rotamer_idx` - The new rotamer index for the residue. + /// * `system_view` - Transaction view for temporary system modifications. + /// * `el_cache` - Cache of pre-computed electrostatic energies. + /// * `active_residues` - Set of all active residues. + /// + /// # Return + /// + /// Returns a `MoveDelta` containing all energy changes from the proposed move. 
+ /// + /// # Errors + /// + /// Returns `EngineError` if the move cannot be applied or energy calculations fail. pub fn calculate_delta_for_move<'a, 'ctx, C>( &self, res_id: ResidueId, @@ -203,6 +287,7 @@ impl EnergyGrid { .map(|term| Some((other_res_id, term))) }; + // Use parallel computation if the feature is enabled, otherwise serial #[cfg(not(feature = "parallel"))] let results: Result, _> = active_residues_vec .iter() @@ -269,6 +354,15 @@ impl EnergyGrid { }) } + /// Applies a pre-calculated move delta to update the energy grid state. + /// + /// This method efficiently updates all internal energy tracking structures + /// using the pre-computed deltas from `calculate_delta_for_move`, avoiding + /// the need for full energy recalculation. + /// + /// # Arguments + /// + /// * `move_delta` - The pre-calculated energy changes from a move. pub fn apply_move(&mut self, move_delta: MoveDelta) { let res_id = move_delta.res_id; From 809f5ee9f2ff69c8f9fcc235d6e54d13956edbaf Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 17:57:41 -0700 Subject: [PATCH 47/83] docs(engine): Add documentation for SystemView struct, including new methods for transactions and rotamer application --- crates/scream-core/src/engine/transaction.rs | 80 ++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/crates/scream-core/src/engine/transaction.rs b/crates/scream-core/src/engine/transaction.rs index 89c19b56..fbb254b6 100644 --- a/crates/scream-core/src/engine/transaction.rs +++ b/crates/scream-core/src/engine/transaction.rs @@ -5,12 +5,21 @@ use crate::core::models::ids::ResidueId; use crate::core::models::system::MolecularSystem; use std::collections::HashMap; +/// Provides a transactional view of the molecular system for optimization operations. +/// +/// This struct enables safe, temporary modifications to the molecular system during +/// optimization moves. 
It tracks rotamer assignments and provides transaction methods +/// that automatically revert changes after evaluation, ensuring the system remains +/// in a consistent state for energy calculations. pub struct SystemView<'a, 'ctx, C> where C: super::context::ProvidesResidueSelections + Sync, { + /// Mutable reference to the molecular system being modified. pub system: &'a mut MolecularSystem, + /// Reference to the optimization context providing necessary data. pub context: &'ctx OptimizationContext<'ctx, C>, + /// Mutable reference to the current rotamer assignments map. pub current_rotamers: &'a mut HashMap, } @@ -18,6 +27,17 @@ impl<'a, 'ctx, C> SystemView<'a, 'ctx, C> where C: super::context::ProvidesResidueSelections + Sync, { + /// Creates a new system view with the provided components. + /// + /// # Arguments + /// + /// * `system` - Mutable reference to the molecular system. + /// * `context` - Reference to the optimization context. + /// * `current_rotamers` - Mutable reference to the rotamer assignments map. + /// + /// # Return + /// + /// Returns a new `SystemView` instance. pub fn new( system: &'a mut MolecularSystem, context: &'ctx OptimizationContext<'ctx, C>, @@ -30,6 +50,23 @@ where } } + /// Applies a rotamer move to the specified residue. + /// + /// This method updates both the molecular system structure and the rotamer + /// tracking map to reflect the new conformation. + /// + /// # Arguments + /// + /// * `res_id` - The residue to modify. + /// * `new_rotamer_idx` - The index of the new rotamer to apply. + /// + /// # Return + /// + /// Returns `Ok(())` on success. + /// + /// # Errors + /// + /// Returns `EngineError` if the rotamer placement fails. pub fn apply_move( &mut self, res_id: ResidueId, @@ -40,6 +77,10 @@ where Ok(()) } + /// Places a specific rotamer on the given residue in the molecular system. + /// + /// This is an internal method that handles the actual placement of rotamer + /// atoms using the placement module. 
fn place_rotamer(&mut self, res_id: ResidueId, rotamer_idx: usize) -> Result<(), EngineError> { let residue = self.system.residue(res_id).unwrap(); let res_type = residue.residue_type.unwrap(); @@ -54,6 +95,25 @@ where placement::place_rotamer_on_system(self.system, res_id, rotamer, topology) } + /// Executes an action within a transaction that automatically reverts changes. + /// + /// This method provides transactional semantics for system modifications. The + /// action is executed, and any changes to the specified residue are automatically + /// reverted after the action completes, regardless of success or failure. + /// + /// # Arguments + /// + /// * `res_id_to_modify` - The residue that may be modified during the action. + /// * `action` - A closure that performs the desired operation on the system view. + /// + /// # Return + /// + /// Returns the result of the action closure. + /// + /// # Errors + /// + /// Returns `EngineError` if the residue is not found in the rotamer map or if + /// the action fails. pub fn transaction( &mut self, res_id_to_modify: ResidueId, @@ -89,6 +149,26 @@ where Ok(result) } + /// Executes an action within a transaction for two residues, automatically reverting changes. + /// + /// This method extends the transaction concept to handle simultaneous modifications + /// of two residues. Both residues are tracked and reverted to their original states + /// after the action completes. + /// + /// # Arguments + /// + /// * `res_a_id` - The first residue that may be modified. + /// * `res_b_id` - The second residue that may be modified. + /// * `action` - A closure that performs the desired operation on the system view. + /// + /// # Return + /// + /// Returns the result of the action closure. + /// + /// # Errors + /// + /// Returns `EngineError` if either residue is not found in the rotamer map or if + /// the action fails. 
pub fn transaction_doublet( &mut self, res_a_id: ResidueId, From 3c808b2d09a64192ab23e1fdbae919e43417f580 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 18:00:31 -0700 Subject: [PATCH 48/83] docs(engine): Add documentation for rotamer placement functions and errors --- crates/scream-core/src/engine/placement.rs | 137 +++++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git a/crates/scream-core/src/engine/placement.rs b/crates/scream-core/src/engine/placement.rs index 89d0ced2..ac2cdc74 100644 --- a/crates/scream-core/src/engine/placement.rs +++ b/crates/scream-core/src/engine/placement.rs @@ -13,25 +13,57 @@ use nalgebra::{Matrix3, Point3, Rotation3, Vector3}; use std::collections::HashMap; use thiserror::Error; +/// Errors that can occur during rotamer placement operations. +/// +/// This enum defines the specific error conditions that may arise when attempting +/// to place a rotamer onto a molecular system, providing detailed information +/// about what went wrong during the placement process. #[derive(Debug, Error)] pub enum PlacementError { + /// An anchor atom required for alignment was not found in the target residue. #[error("Anchor atom '{atom_name}' not found in the target residue in the system")] AnchorAtomNotFoundInSystem { atom_name: String }, + /// An anchor atom required for alignment was not found in the rotamer template. #[error("Anchor atom '{atom_name}' not found in the rotamer template")] AnchorAtomNotFoundInRotamer { atom_name: String }, + /// No atoms with the specified name were found in the rotamer for building the index map. #[error( "Could not find any atoms with name '{atom_name}' in the rotamer to build the index map" )] RotamerAtomNameNotFound { atom_name: String }, + /// Insufficient anchor atoms were available for stable alignment. 
#[error( "Insufficient anchor atoms for stable alignment: requires at least 3, but found {found}" )] InsufficientAnchors { found: usize }, } +/// Places a rotamer onto a target residue in the molecular system. +/// +/// This function performs a complete rotamer placement operation, which involves +/// three main phases: alignment calculation, side-chain replacement, and topology +/// reconstruction. The rotamer is aligned to the target residue using backbone +/// anchor atoms, then the old side-chain is removed and replaced with the new +/// rotamer's side-chain atoms, and finally the bonding topology is rebuilt. +/// +/// # Arguments +/// +/// * `system` - The molecular system to modify. +/// * `target_residue_id` - The ID of the residue to place the rotamer on. +/// * `rotamer` - The rotamer template to place. +/// * `topology` - The residue topology defining anchor and side-chain atoms. +/// +/// # Return +/// +/// Returns `Ok(())` on successful placement. +/// +/// # Errors +/// +/// Returns `EngineError::Placement` if any step of the placement process fails, +/// with detailed information about the specific error condition. #[inline] pub fn place_rotamer_on_system( system: &mut MolecularSystem, @@ -66,6 +98,27 @@ pub fn place_rotamer_on_system( }) } +/// Calculates the rotation and translation needed to align the rotamer with the target residue. +/// +/// This function determines the optimal rigid-body transformation (rotation and translation) +/// to align the rotamer's backbone atoms with the corresponding atoms in the target residue. +/// The alignment uses the anchor atoms defined in the residue topology to ensure proper +/// positioning of the rotamer relative to the existing molecular structure. +/// +/// # Arguments +/// +/// * `system` - The molecular system containing the target residue. +/// * `target_residue_id` - The ID of the residue to align to. +/// * `rotamer` - The rotamer to be aligned. 
+/// * `topology` - The topology defining which atoms to use for alignment. +/// +/// # Return +/// +/// Returns a tuple of `(Rotation3, Vector3)` representing the rotation and translation. +/// +/// # Errors +/// +/// Returns `PlacementError` if anchor atoms are missing or insufficient for alignment. fn calculate_alignment_transform( system: &MolecularSystem, target_residue_id: ResidueId, @@ -122,6 +175,25 @@ fn calculate_alignment_transform( calculate_transformation(&rotamer_points, &system_points) } +/// Removes all side-chain atoms from the target residue. +/// +/// This function identifies and removes all atoms marked as side-chain atoms +/// from the target residue, preparing it for the placement of a new rotamer's +/// side-chain. Backbone atoms are preserved to maintain the residue's position +/// in the molecular structure. +/// +/// # Arguments +/// +/// * `system` - The molecular system to modify. +/// * `target_residue_id` - The ID of the residue to clean. +/// +/// # Return +/// +/// Returns `Ok(())` after removing all side-chain atoms. +/// +/// # Errors +/// +/// This function does not return errors under normal circumstances. fn remove_old_sidechain( system: &mut MolecularSystem, target_residue_id: ResidueId, @@ -143,6 +215,28 @@ fn remove_old_sidechain( Ok(()) } +/// Adds new side-chain atoms from the rotamer and creates an index-to-ID mapping. +/// +/// This function processes all atoms in the rotamer, applying the calculated +/// transformation to position them correctly. For backbone atoms, it maps to +/// existing system atoms. For side-chain atoms, it creates new atoms with +/// transformed positions and adds them to the system. +/// +/// # Arguments +/// +/// * `system` - The molecular system to modify. +/// * `target_residue_id` - The ID of the residue to add atoms to. +/// * `rotamer` - The rotamer containing the atoms to add. +/// * `rotation` - The rotation transformation to apply. 
+/// * `translation` - The translation transformation to apply. +/// +/// # Return +/// +/// Returns a `HashMap` mapping rotamer atom indices to system atom IDs. +/// +/// # Errors +/// +/// Returns `PlacementError` if backbone atoms cannot be matched or other issues occur. fn add_new_sidechain_atoms_and_map( system: &mut MolecularSystem, target_residue_id: ResidueId, @@ -202,6 +296,25 @@ fn add_new_sidechain_atoms_and_map( Ok(index_to_id_map) } +/// Rebuilds the bonding topology for the newly placed rotamer. +/// +/// This function recreates all the chemical bonds defined in the rotamer template +/// using the atom ID mapping created during atom placement. Only bonds between +/// atoms that were successfully mapped are created. +/// +/// # Arguments +/// +/// * `system` - The molecular system to modify. +/// * `rotamer` - The rotamer containing the bond definitions. +/// * `index_to_id_map` - Mapping from rotamer atom indices to system atom IDs. +/// +/// # Return +/// +/// Returns `Ok(())` after adding all valid bonds. +/// +/// # Errors +/// +/// This function does not return errors under normal circumstances. fn rebuild_topology( system: &mut MolecularSystem, rotamer: &Rotamer, @@ -217,33 +330,57 @@ fn rebuild_topology( Ok(()) } +/// Calculates the optimal rigid-body transformation between two point sets. +/// +/// This function uses the Kabsch algorithm (via SVD) to find the rotation and +/// translation that best aligns the 'from' points to the 'to' points. It handles +/// the reflection case by ensuring the rotation matrix has a positive determinant. +/// +/// # Arguments +/// +/// * `from_points` - The source point set to transform from. +/// * `to_points` - The target point set to transform to. +/// +/// # Return +/// +/// Returns a tuple of `(Rotation3, Vector3)` representing the optimal rotation and translation. +/// +/// # Errors +/// +/// Returns `PlacementError` if the point sets are incompatible. 
fn calculate_transformation( from_points: &[Point3], to_points: &[Point3], ) -> Result<(Rotation3, Vector3), PlacementError> { + // Calculate centroids of both point sets let from_centroid_sum: Vector3 = from_points.iter().map(|p| p.coords).sum(); let from_centroid = Point3::from(from_centroid_sum / from_points.len() as f64); let to_centroid_sum: Vector3 = to_points.iter().map(|p| p.coords).sum(); let to_centroid = Point3::from(to_centroid_sum / to_points.len() as f64); + // Center both point sets around their centroids let centered_from: Vec<_> = from_points.iter().map(|p| p - from_centroid).collect(); let centered_to: Vec<_> = to_points.iter().map(|p| p - to_centroid).collect(); + // Compute the covariance matrix H let h = centered_from .iter() .zip(centered_to.iter()) .fold(Matrix3::zeros(), |acc, (f, t)| acc + t * f.transpose()); + // Perform SVD on H to find optimal rotation let svd = h.svd(true, true); let u = svd.u.unwrap(); let v_t = svd.v_t.unwrap(); + // Handle reflection case by correcting the rotation matrix let d = (u * v_t.transpose()).determinant(); let mut correction = Matrix3::identity(); if d < 0.0 { correction[(2, 2)] = -1.0; } + // Compute final rotation matrix and convert to Rotation3 let rotation_matrix = u * correction * v_t; let rotation = Rotation3::from_matrix(&rotation_matrix); let translation = to_centroid.coords - rotation * from_centroid.coords; From 1257890472320c3134bd139b4a73349a00aa296c Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 18:04:29 -0700 Subject: [PATCH 49/83] docs(engine): Documents Boltzmann sampling and errors --- .../scream-core/src/engine/utils/sampling.rs | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/crates/scream-core/src/engine/utils/sampling.rs b/crates/scream-core/src/engine/utils/sampling.rs index f1297860..9516ebb0 100644 --- a/crates/scream-core/src/engine/utils/sampling.rs +++ b/crates/scream-core/src/engine/utils/sampling.rs @@ -2,16 +2,30 @@ use 
rand::{distributions::WeightedIndex, prelude::*}; use thiserror::Error; use tracing::instrument; +/// Defines errors that can occur during sampling operations. +/// +/// This enum represents various failure modes in the sampling utilities, +/// particularly for Boltzmann-weighted sampling of energy landscapes. +/// Each variant provides specific information about the nature of the error. #[derive(Debug, Error)] pub enum SamplingError { + /// Indicates that the input energies list is empty, preventing any sampling operation. #[error("Input energies list is empty, cannot perform sampling")] EmptyEnergies, + /// Occurs when all Boltzmann weights sum to effectively zero, typically due to + /// very low temperatures (high beta) or extremely large energy differences. #[error( "All energies are too high or beta is zero, resulting in zero total weight for sampling" )] ZeroTotalWeight, + /// Signals that the provided beta value is invalid for Boltzmann sampling. + /// + /// Beta must be positive as it represents the inverse temperature parameter. #[error("Invalid beta value: {0}. Beta must be positive for Boltzmann sampling")] InvalidBeta(f64), + /// Wraps errors from the underlying random sampling distribution creation. + /// + /// This typically occurs when weights contain invalid values (NaN, negative, etc.). #[error("Failed to create weighted distribution: {source}")] DistributionError { #[from] @@ -19,6 +33,32 @@ pub enum SamplingError { }, } +/// Performs Boltzmann-weighted sampling from a list of energies. +/// +/// This function implements the Boltzmann distribution sampling algorithm commonly used +/// in molecular simulations and optimization. It converts energy values to probabilities +/// using the formula P(i) ∝ exp(-β(E_i - E_min)), where β is the inverse temperature +/// parameter and E_min is the minimum energy in the set. 
+/// +/// The algorithm handles numerical underflow by detecting when total weights become +/// negligibly small and falling back to selecting the minimum energy state. +/// +/// # Arguments +/// +/// * `energies` - A slice of energy values to sample from. Must not be empty. +/// * `beta` - The inverse temperature parameter (β = 1/kT). Must be positive. +/// * `rng` - A mutable random number generator implementing the `Rng` trait. +/// +/// # Return +/// +/// Returns the index of the selected energy state, or a `SamplingError` if sampling fails. +/// +/// # Errors +/// +/// Returns `SamplingError::EmptyEnergies` if the energies slice is empty. +/// Returns `SamplingError::InvalidBeta` if beta is not positive. +/// Returns `SamplingError::ZeroTotalWeight` if all weights underflow to zero. +/// Returns `SamplingError::DistributionError` if weight distribution creation fails. #[instrument(level = "trace", skip_all, fields(beta))] pub fn boltzmann_sample( energies: &[f64], @@ -32,22 +72,26 @@ pub fn boltzmann_sample( return Err(SamplingError::InvalidBeta(beta)); } + // Find the minimum energy to improve numerical stability let min_energy = *energies .iter() .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)) .unwrap(); + // Compute Boltzmann weights: exp(-β(E_i - E_min)) let weights: Vec = energies .iter() .map(|&e| (-(e - min_energy) * beta).exp()) .collect(); let total_weight: f64 = weights.iter().sum(); + // Handle numerical underflow by checking if total weight is effectively zero if total_weight <= f64::EPSILON { tracing::warn!( "Total Boltzmann weight is near zero ({}). This might indicate a very low temperature or large energy differences, leading to numerical underflow. 
Returning first index as fallback.", total_weight ); + // Fallback: return the index of the minimum energy state if let Some(idx) = energies .iter() .position(|&e| (e - min_energy).abs() < f64::EPSILON) From 7370a36e20c76b92407afb7c43507c9a0e8ace35 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 18:08:34 -0700 Subject: [PATCH 50/83] docs(engine): Add documentation for sidechain and environment atom functions --- crates/scream-core/src/engine/utils/query.rs | 75 ++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/crates/scream-core/src/engine/utils/query.rs b/crates/scream-core/src/engine/utils/query.rs index 20d78501..2788f94b 100644 --- a/crates/scream-core/src/engine/utils/query.rs +++ b/crates/scream-core/src/engine/utils/query.rs @@ -10,6 +10,20 @@ use std::collections::{HashMap, HashSet}; #[cfg(feature = "parallel")] use rayon::prelude::*; +/// Collects all sidechain atoms for a set of active residues. +/// +/// This function iterates through the specified active residues and extracts +/// all atoms classified as sidechain atoms according to their role in the molecular system. +/// The result is a mapping from residue IDs to their corresponding sidechain atom IDs. +/// +/// # Arguments +/// +/// * `system` - The molecular system containing the residues and atoms. +/// * `active_residues` - A set of residue IDs for which to collect sidechain atoms. +/// +/// # Return +/// +/// Returns a `HashMap` where keys are residue IDs and values are vectors of sidechain atom IDs. pub fn collect_active_sidechain_atoms( system: &MolecularSystem, active_residues: &HashSet, @@ -36,6 +50,20 @@ pub fn collect_active_sidechain_atoms( map } +/// Precomputes the set of environment atoms for optimization. +/// +/// Environment atoms are those that are not part of the active residues' sidechains +/// but may interact with them during energy calculations. This includes backbone atoms +/// from active residues and all atoms from inactive residues. 
+/// +/// # Arguments +/// +/// * `system` - The molecular system containing all atoms. +/// * `active_residues` - A set of residue IDs that are being optimized. +/// +/// # Return +/// +/// Returns a vector of atom IDs representing the environment atoms. pub fn precompute_environment_atoms( system: &MolecularSystem, active_residues: &HashSet, @@ -54,6 +82,27 @@ pub fn precompute_environment_atoms( .collect() } +/// Resolves a residue selection specification into a set of residue IDs. +/// +/// This function interprets various types of residue selection criteria and converts +/// them into concrete residue IDs from the molecular system. It supports selecting +/// all residues, explicit include/exclude lists, and ligand binding site detection. +/// The final result is filtered to only include residues that have available rotamers +/// in the provided library. +/// +/// # Arguments +/// +/// * `system` - The molecular system containing the residues to select from. +/// * `selection` - The selection criteria specifying which residues to include. +/// * `library` - The rotamer library used to filter residues by rotamer availability. +/// +/// # Return +/// +/// Returns a set of residue IDs that match the selection criteria and have available rotamers. +/// +/// # Errors +/// +/// Returns `EngineError::ResidueNotFound` if a specified residue cannot be found in the system. 
pub fn resolve_selection_to_ids( system: &MolecularSystem, selection: &ResidueSelection, @@ -63,12 +112,16 @@ pub fn resolve_selection_to_ids( match selection { ResidueSelection::All => { + // Select all residues in the system candidate_ids = system.residues_iter().map(|(id, _)| id).collect(); } ResidueSelection::List { include, exclude } => { + // Handle explicit include/exclude lists if include.is_empty() && !exclude.is_empty() { + // If only exclusions specified, start with all residues candidate_ids = system.residues_iter().map(|(id, _)| id).collect(); } else { + // Add explicitly included residues for spec in include { let chain_id = system .find_chain_by_id(spec.chain_id) @@ -80,6 +133,7 @@ pub fn resolve_selection_to_ids( } } + // Remove explicitly excluded residues for spec in exclude { if let Some(chain_id) = system.find_chain_by_id(spec.chain_id) { if let Some(residue_id) = @@ -94,6 +148,7 @@ pub fn resolve_selection_to_ids( ligand_residue, radius_angstroms, } => { + // Find residues within binding distance of the ligand let ligand_chain_id = system .find_chain_by_id(ligand_residue.chain_id) .ok_or_else(|| EngineError::ResidueNotFound { @@ -105,6 +160,7 @@ pub fn resolve_selection_to_ids( spec: ligand_residue.clone(), })?; + // Collect heavy atom positions from the ligand let mut ligand_heavy_atom_positions: Vec<[f64; 3]> = Vec::new(); if let Some(ligand_res) = system.residue(ligand_res_id) { for atom_id in ligand_res.atoms() { @@ -124,15 +180,18 @@ pub fn resolve_selection_to_ids( return Ok(HashSet::new()); } + // Build spatial index for efficient distance queries let kdtree: KdTree = (&ligand_heavy_atom_positions).into(); let radius_sq = radius_angstroms * radius_angstroms; + // Use parallel iteration if available #[cfg(not(feature = "parallel"))] let iterator = system.residues_iter(); #[cfg(feature = "parallel")] let iterator = system.residues_iter().par_bridge(); + // Find protein residues within the specified radius let binding_site_ids: HashSet = 
iterator .filter_map(|(res_id, residue)| { if res_id == ligand_res_id { @@ -146,6 +205,7 @@ pub fn resolve_selection_to_ids( return None; } + // Check if any heavy atom of this residue is within radius let is_in_binding_site = residue.atoms().iter().any(|protein_atom_id| { if let Some(protein_atom) = system.atom(*protein_atom_id) { if is_heavy_atom(&protein_atom.name) { @@ -173,6 +233,7 @@ pub fn resolve_selection_to_ids( } }; + // Filter to only include residues with available rotamers let final_active_residues = candidate_ids .into_iter() .filter(|&residue_id| { @@ -188,6 +249,20 @@ pub fn resolve_selection_to_ids( Ok(final_active_residues) } +/// Determines whether an atom is a heavy atom (non-hydrogen). +/// +/// Heavy atoms are defined as any atom whose name does not start with 'H' or 'D' +/// (deuterium). This classification is used in various molecular calculations +/// where hydrogen atoms are often treated differently due to their small size +/// and different interaction properties. +/// +/// # Arguments +/// +/// * `atom_name` - The name of the atom to classify. +/// +/// # Return +/// +/// Returns `true` if the atom is a heavy atom, `false` if it is hydrogen or deuterium. fn is_heavy_atom(atom_name: &str) -> bool { let first_char = atom_name .trim() From 3bd3bf8a89c0a0a0847ea8966a66ae9ac940e597 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 18:08:49 -0700 Subject: [PATCH 51/83] docs(engine): Add documentation for utility functions in the engine --- crates/scream-core/src/engine/utils/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crates/scream-core/src/engine/utils/mod.rs b/crates/scream-core/src/engine/utils/mod.rs index 63f42945..da6a5b2e 100644 --- a/crates/scream-core/src/engine/utils/mod.rs +++ b/crates/scream-core/src/engine/utils/mod.rs @@ -1,2 +1,8 @@ +//! Utility functions for the engine module. +//! +//! This module provides various utility functions that support the core engine operations, +//! 
including residue selection and querying, sampling algorithms, and other helper functions +//! used throughout the optimization and placement processes. + pub mod query; pub mod sampling; From 860341b17bb7e821d1185d02a7bca50c8fd5b5aa Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 19:51:45 -0700 Subject: [PATCH 52/83] docs(engine): Add documentation for ClashPair struct and clash detection function --- .../src/engine/tasks/clash_detection.rs | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/crates/scream-core/src/engine/tasks/clash_detection.rs b/crates/scream-core/src/engine/tasks/clash_detection.rs index 2c9a9b10..3abb0a16 100644 --- a/crates/scream-core/src/engine/tasks/clash_detection.rs +++ b/crates/scream-core/src/engine/tasks/clash_detection.rs @@ -12,10 +12,21 @@ use tracing::{info, instrument}; #[cfg(feature = "parallel")] use rayon::prelude::*; +/// Represents a pair of residues that are clashing with each other. +/// +/// A clash occurs when the interaction energy between two residues exceeds +/// a specified threshold, indicating unfavorable steric or electrostatic interactions. +/// This struct stores the residue IDs and the computed energy of the clash. #[derive(Debug, Clone, PartialEq)] pub struct ClashPair { + /// The ID of the first residue in the clashing pair. pub residue_a: ResidueId, + /// The ID of the second residue in the clashing pair. pub residue_b: ResidueId, + /// The computed interaction energy between the two residues. + /// + /// This includes all relevant energy terms (van der Waals, electrostatic, etc.) + /// as calculated by the forcefield scorer. pub energy: EnergyTerm, } @@ -33,6 +44,30 @@ impl Ord for ClashPair { } } +/// Detects steric and energetic clashes between pairs of active residues. +/// +/// This function performs pairwise clash detection by computing interaction energies +/// between all combinations of active residues. 
Residue pairs with energies exceeding +/// the specified threshold are identified as clashes. The algorithm uses parallel +/// processing when available to improve performance on large systems. +/// +/// The results are sorted by energy in descending order (most severe clashes first), +/// making it easy to prioritize the most problematic interactions for resolution. +/// +/// # Arguments +/// +/// * `system` - The molecular system containing the residues to analyze. +/// * `forcefield` - The forcefield parameters used for energy calculations. +/// * `active_residues` - The set of residue IDs to check for clashes. +/// * `clash_threshold_kcal_mol` - The energy threshold above which interactions are considered clashes. +/// +/// # Return +/// +/// Returns a vector of `ClashPair` structs representing detected clashes, sorted by energy descending. +/// +/// # Errors +/// +/// Returns `EngineError` if energy calculation fails for any residue pair. #[instrument(skip_all, name = "clash_detection_task")] pub fn run( system: &MolecularSystem, @@ -45,6 +80,7 @@ pub fn run( "Detecting residue clashes." 
); + // Generate all unique pairs of active residues for comparison let residue_pairs: Vec<_> = active_residues.iter().combinations(2).collect(); if residue_pairs.is_empty() { @@ -53,6 +89,7 @@ pub fn run( let scorer = Scorer::new(system, forcefield); + // Use parallel iteration if the "parallel" feature is enabled #[cfg(not(feature = "parallel"))] let iterator = residue_pairs.iter(); @@ -81,6 +118,7 @@ pub fn run( let mut clashes = clashes?; + // Sort clashes by energy in descending order (most severe first) clashes.sort_unstable(); info!(num_clashes = clashes.len(), "Clash detection complete."); From 347ba0cd033aad245bbb3183989ce83e5363c522 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 19:54:37 -0700 Subject: [PATCH 53/83] docs(engine): Add documentation for doublet optimization and add error handling details --- .../src/engine/tasks/doublet_optimization.rs | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/crates/scream-core/src/engine/tasks/doublet_optimization.rs b/crates/scream-core/src/engine/tasks/doublet_optimization.rs index 5a672a18..2f7668b3 100644 --- a/crates/scream-core/src/engine/tasks/doublet_optimization.rs +++ b/crates/scream-core/src/engine/tasks/doublet_optimization.rs @@ -14,13 +14,55 @@ use tracing::{debug, instrument}; #[cfg(feature = "parallel")] use rayon::prelude::*; +/// Represents the result of optimizing a pair of residues (doublet) simultaneously. +/// +/// This struct contains the optimal rotamer indices for both residues in the pair +/// along with the computed local energy of the optimized configuration. The local +/// energy includes interactions between the two residues and with other active residues +/// in the system, but excludes the individual rotamer self-energies. #[derive(Debug, Clone, Copy)] pub struct DoubletResult { + /// The index of the optimal rotamer for the first residue in the pair. 
pub rotamer_idx_a: usize, + /// The index of the optimal rotamer for the second residue in the pair. pub rotamer_idx_b: usize, + /// The computed local interaction energy for the optimal rotamer pair. + /// + /// This includes the interaction energy between residues A and B, plus + /// interactions of each residue with all other active residues in the system. pub best_local_energy: f64, } +/// Performs simultaneous optimization of two residues (doublet optimization). +/// +/// This function optimizes the rotamer configurations of two residues simultaneously +/// by enumerating the combinations of their rotamers. It uses a brute-force +/// approach with a pruning heuristic (unpromising pairs are skipped) to find the pair that minimizes +/// the total interaction energy with other active residues in the system. +/// +/// The algorithm employs parallel processing when available and uses cached individual +/// rotamer energies to avoid recomputation. Progress is reported through the provided +/// reporter for long-running optimizations. +/// +/// # Arguments +/// +/// * `res_a_id` - The ID of the first residue in the doublet. +/// * `res_b_id` - The ID of the second residue in the doublet. +/// * `system` - The molecular system containing the residues. +/// * `el_cache` - Cache of precomputed individual rotamer energies. +/// * `context` - The optimization context containing forcefield and rotamer library. +/// * `active_residues` - Set of all active residue IDs in the optimization. +/// * `reporter` - Progress reporter for tracking optimization progress. +/// +/// # Return +/// +/// Returns a `DoubletResult` containing the optimal rotamer indices and local energy. +/// +/// # Errors +/// +/// Returns `EngineError::Internal` if residues are not found or not standard types. +/// Returns `EngineError::RotamerLibrary` if rotamers are not available for the residues. +/// Returns `EngineError::PhaseFailed` if no valid rotamer pairs can be evaluated.
#[instrument(skip_all, name = "doublet_optimization_task", fields(res_a = ?res_a_id, res_b = ?res_b_id))] pub fn run( res_a_id: ResidueId, @@ -35,6 +77,7 @@ where C: ProvidesResidueSelections + Sync, { // --- 1. Prepare data --- + // Validate and extract residue information let residue_a = system.residue(res_a_id).ok_or_else(|| { EngineError::Internal(format!("Residue {:?} not found in system", res_a_id)) })?; @@ -55,6 +98,7 @@ where )) })?; + // Retrieve rotamer libraries for both residues let rotamers_a = context .rotamer_library .get_rotamers_for(res_type_a) @@ -81,6 +125,7 @@ where }); } + // Generate all possible rotamer combinations let index_pairs: Vec<(usize, usize)> = (0..rotamers_a.len()) .flat_map(|i| (0..rotamers_b.len()).map(move |j| (i, j))) .collect(); @@ -99,6 +144,7 @@ where let processed_count = AtomicUsize::new(0); const REPORT_INTERVAL: usize = 256; + // Collect IDs of other active residues for interaction calculations let other_active_residue_ids: Vec = active_residues .iter() .filter(|&&id| id != res_a_id && id != res_b_id) @@ -109,10 +155,12 @@ where let best_pair_result = { #[cfg(feature = "parallel")] { + // Parallel evaluation using rayon index_pairs .par_iter() .fold( || -> Result<(Option, MolecularSystem, HashMap, &OptimizationContext, f64), EngineError> { + // Initialize thread-local state let thread_local_system = system.clone(); let mut thread_local_rotamers = HashMap::new(); thread_local_rotamers.insert(res_a_id, 0); @@ -134,6 +182,7 @@ where reporter.report(Progress::TaskIncrement { amount: REPORT_INTERVAL as u64 }); } + // Early termination heuristic: skip if individual energies are too high const MAX_FAVORABLE_INTERACTION: f64 = -20.0; let el_a = el_cache @@ -150,6 +199,7 @@ where return acc; } + // Evaluate rotamer pair using transactional system view let mut system_view = SystemView::new(thread_system, thread_context, thread_rotamers); match system_view.transaction_doublet(res_a_id, res_b_id, |view| { @@ -157,6 +207,8 @@ where 
view.apply_move(res_b_id, idx_b)?; let scorer = Scorer::new(view.system, thread_context.forcefield); + + // Helper to extract sidechain atoms let get_sc_atoms = |sys: &MolecularSystem, res_id: ResidueId| -> Vec { sys.residue(res_id) .unwrap() @@ -181,6 +233,7 @@ where .flat_map(|&id| get_sc_atoms(view.system, id)) .collect(); + // Calculate all relevant interaction energies let inter_ab = scorer.score_interaction(&atoms_a_sc, &atoms_b_sc)?; let inter_a_others = scorer.score_interaction(&atoms_a_sc, &other_active_sc_atoms)?; let inter_b_others = scorer.score_interaction(&atoms_b_sc, &other_active_sc_atoms)?; @@ -217,6 +270,7 @@ where #[cfg(not(feature = "parallel"))] { + // Sequential evaluation let mut best_result: Option = None; let mut best_energy: f64 = f64::MAX; @@ -228,6 +282,7 @@ where }); } + // Early termination heuristic const MAX_FAVORABLE_INTERACTION: f64 = -20.0; let el_a = el_cache @@ -244,6 +299,7 @@ where continue; } + // Evaluate rotamer pair let mut thread_system = system.clone(); let mut thread_rotamers = HashMap::new(); thread_rotamers.insert(res_a_id, 0); @@ -257,6 +313,8 @@ where view.apply_move(res_b_id, idx_b)?; let scorer = Scorer::new(view.system, context.forcefield); + + // Helper to extract sidechain atoms let get_sc_atoms = |sys: &MolecularSystem, res_id: ResidueId| -> Vec { sys.residue(res_id) .unwrap() @@ -281,6 +339,7 @@ where .flat_map(|&id| get_sc_atoms(view.system, id)) .collect(); + // Calculate interaction energies let inter_ab = scorer.score_interaction(&atoms_a_sc, &atoms_b_sc)?; let inter_a_others = scorer.score_interaction(&atoms_a_sc, &other_active_sc_atoms)?; @@ -311,6 +370,7 @@ where } }; + // Report final progress let final_count = processed_count.load(Ordering::Relaxed); if final_count > 0 && final_count % REPORT_INTERVAL != 0 { reporter.report(Progress::TaskIncrement { From 5ce110fb3d8f88cfc2001842381b574ca3450425 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 19:57:50 -0700 Subject: [PATCH 54/83] 
docs(engine): Add documentation for EL energy computation functions and work units --- .../scream-core/src/engine/tasks/el_energy.rs | 112 +++++++++++++++++- 1 file changed, 109 insertions(+), 3 deletions(-) diff --git a/crates/scream-core/src/engine/tasks/el_energy.rs b/crates/scream-core/src/engine/tasks/el_energy.rs index aad6a4e5..8c0e1e38 100644 --- a/crates/scream-core/src/engine/tasks/el_energy.rs +++ b/crates/scream-core/src/engine/tasks/el_energy.rs @@ -17,14 +17,48 @@ use tracing::{info, instrument, warn}; #[cfg(feature = "parallel")] use rayon::prelude::*; +/// Represents a unit of work for EL energy computation. +/// +/// Each work unit corresponds to a specific residue and residue type combination +/// that needs to have its rotamer energies precomputed and cached. #[derive(Debug)] struct WorkUnit { + /// The ID of the residue to process. residue_id: ResidueId, + /// The residue type for which to compute energies. residue_type: ResidueType, } +/// Type alias for the result of processing a work unit. +/// +/// Contains either the computed energy map for a residue-type combination +/// or an error that occurred during computation. type WorkResult = Result<((ResidueId, ResidueType), HashMap), EngineError>; +/// Precomputes Empty Lattice (EL) energies for all rotamers of active residues. +/// +/// This function performs the computationally intensive task of calculating interaction +/// energies between each rotamer of each active residue and the fixed molecular environment. +/// The results are cached to avoid recomputation during optimization. This is a key +/// preprocessing step that enables efficient side-chain placement algorithms. +/// +/// The EL energy for each rotamer includes: +/// - Interactions with all non-active atoms (backbone and other residues) +/// - Internal non-bonded energies within the rotamer itself +/// +/// The computation can be parallelized when the "parallel" feature is enabled. 
+/// +/// # Arguments +/// +/// * `context` - The optimization context containing system, forcefield, and configuration. +/// +/// # Return +/// +/// Returns an `ELCache` containing precomputed energies for all rotamer combinations. +/// +/// # Errors +/// +/// Returns `EngineError` if work list construction fails or energy calculations encounter errors. #[instrument(skip_all, name = "el_energy_cache_generation_task")] pub fn run(context: &OptimizationContext) -> Result where @@ -35,6 +69,7 @@ where name: "EL Pre-computation", }); + // Build the list of work units (residue-type combinations to process) let work_list = build_work_list(context)?; if work_list.is_empty() { @@ -43,6 +78,7 @@ where return Ok(ELCache::new()); } + // Precompute environment atoms for efficient energy calculations let active_residue_ids = context.resolve_all_active_residues()?; let environment_atom_ids = precompute_environment_atoms(context.system, &active_residue_ids); @@ -50,12 +86,14 @@ where total: work_list.len() as u64, }); + // Use parallel processing if available #[cfg(not(feature = "parallel"))] let iterator = work_list.iter(); #[cfg(feature = "parallel")] let iterator = work_list.par_iter(); + // Process each work unit to compute rotamer energies let results: Vec = iterator .map(|unit| { #[cfg(feature = "parallel")] @@ -74,6 +112,7 @@ where context.reporter.report(Progress::TaskFinish); + // Populate the cache with computed energies let mut cache = ELCache::new(); for result in results { let ((residue_id, residue_type), energy_map) = result?; @@ -90,6 +129,29 @@ where Ok(cache) } +/// Calculates the current total Empty Lattice energy for all active sidechains. +/// +/// This function computes the EL energy of the current molecular conformation by +/// summing the interaction energies between each active sidechain and the molecular +/// environment, plus the internal energies within each sidechain. This provides +/// a baseline energy measurement for the current state. 
+/// +/// The calculation includes: +/// - Interactions between each sidechain and all environment atoms +/// - Internal non-bonded energies within each sidechain +/// - Sum across all active residues +/// +/// # Arguments +/// +/// * `context` - The optimization context containing the current system state. +/// +/// # Return +/// +/// Returns the total EL energy as an `EnergyTerm`. +/// +/// # Errors +/// +/// Returns `EngineError` if energy calculations fail for any residue. #[instrument(skip_all, name = "current_el_energy_task")] pub fn calculate_current(context: &OptimizationContext) -> Result where @@ -109,6 +171,7 @@ where let mut total_el_energy = EnergyTerm::default(); for residue_id in active_residues { + // Extract sidechain atoms for this residue let sidechain_atoms: Vec = context .system .residue(residue_id) @@ -130,14 +193,14 @@ where continue; } - // 1. Interaction with fixed environment for THIS sidechain + // Calculate interaction with environment atoms let interaction_energy = scorer.score_interaction(&sidechain_atoms, &environment_atom_ids)?; - // 2. Internal non-bonded energy for THIS sidechain + // Calculate internal energy within the sidechain let internal_energy = scorer.score_group_internal(&sidechain_atoms)?; - // 3. Add this residue's complete EL energy to the total + // Add this residue's complete EL energy to the total total_el_energy += interaction_energy + internal_energy; } @@ -148,6 +211,23 @@ where Ok(total_el_energy) } +/// Builds the list of work units for EL energy computation. +/// +/// This function determines which residue-type combinations need to have their +/// rotamer energies precomputed. It considers both design sites (where multiple +/// residue types are allowed) and native residues that have available rotamers. +/// +/// # Arguments +/// +/// * `context` - The optimization context containing configuration and system data. 
+/// +/// # Return +/// +/// Returns a vector of `WorkUnit` structs representing the computation tasks. +/// +/// # Errors +/// +/// Returns `EngineError` if residue resolution fails. fn build_work_list(context: &OptimizationContext) -> Result, EngineError> where C: ProvidesResidueSelections + Sync, @@ -158,6 +238,7 @@ where for &residue_id in &active_residues { let residue = context.system.residue(residue_id).unwrap(); + // Check if this is a design site with multiple allowed types let mut is_design_site = false; if let Some(design_spec) = context.config.design_spec() { let chain = context.system.chain(residue.chain_id).unwrap(); @@ -180,6 +261,7 @@ where } } + // If not a design site, use the native residue type if !is_design_site { if let Some(native_type) = residue.residue_type { if context @@ -198,6 +280,22 @@ where Ok(work_list) } +/// Computes EL energies for all rotamers of a specific residue-type combination. +/// +/// This function places each available rotamer for the given residue and computes +/// its interaction energy with the molecular environment plus its internal energy. +/// The results are collected into an energy map for caching. +/// +/// # Arguments +/// +/// * `unit` - The work unit specifying residue and type to process. +/// * `environment_atom_ids` - Precomputed list of environment atom IDs. +/// * `context` - The optimization context containing forcefield and topology. +/// * `system` - Mutable reference to the molecular system for rotamer placement. +/// +/// # Return +/// +/// Returns a `WorkResult` containing the computed energy map or an error. 
#[instrument(skip_all, fields(residue_id = ?unit.residue_id, residue_type = %unit.residue_type))] fn compute_energies_for_unit( unit: &WorkUnit, @@ -208,6 +306,7 @@ fn compute_energies_for_unit( where C: ProvidesResidueSelections + Sync, { + // Retrieve rotamers for this residue type let rotamers = context .rotamer_library .get_rotamers_for(unit.residue_type) @@ -216,6 +315,7 @@ where message: "No rotamers found for this residue type during EL calculation.".to_string(), })?; + // Get topology for rotamer placement let residue_name = unit.residue_type.to_three_letter(); let topology = context.topology_registry.get(residue_name).ok_or_else(|| { EngineError::TopologyNotFound { @@ -225,9 +325,12 @@ where let mut energy_map = HashMap::with_capacity(rotamers.len()); + // Process each rotamer for (rotamer_idx, rotamer) in rotamers.iter().enumerate() { + // Place the rotamer on the system place_rotamer_on_system(system, unit.residue_id, rotamer, topology)?; + // Extract sidechain atoms for energy calculation let query_atoms: Vec = system .residue(unit.residue_id) .unwrap() @@ -251,7 +354,9 @@ where let scorer = Scorer::new(system, context.forcefield); + // Calculate interaction with environment let interaction_energy = scorer.score_interaction(&query_atoms, environment_atom_ids)?; + // Calculate internal energy within rotamer let internal_energy = scorer.score_group_internal(&query_atoms)?; let total_el_energy = interaction_energy + internal_energy; @@ -259,6 +364,7 @@ where energy_map.insert(rotamer_idx, total_el_energy); } + // Report progress context .reporter .report(Progress::TaskIncrement { amount: 1 }); From 2526841624c30725f321652293579600322db259 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 20:01:01 -0700 Subject: [PATCH 55/83] docs(engine): Add documentation for fixed energy calculations and fixed atom ID collection --- .../src/engine/tasks/fixed_energy.rs | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git 
a/crates/scream-core/src/engine/tasks/fixed_energy.rs b/crates/scream-core/src/engine/tasks/fixed_energy.rs index a32e13e3..33920caf 100644 --- a/crates/scream-core/src/engine/tasks/fixed_energy.rs +++ b/crates/scream-core/src/engine/tasks/fixed_energy.rs @@ -6,6 +6,26 @@ use crate::engine::context::{OptimizationContext, ProvidesResidueSelections}; use crate::engine::error::EngineError; use tracing::{info, instrument}; +/// Calculates the constant energy offset for the fixed parts of the molecular system. +/// +/// This function computes the internal non-bonded energy of all atoms that are not part of +/// the active residues being optimized. This energy offset remains constant throughout +/// the optimization process and is used to normalize total energy calculations by providing +/// a baseline energy for the fixed structural elements. +/// +/// # Arguments +/// +/// * `context` - The optimization context containing the molecular system, forcefield parameters, and configuration for residue selections. +/// +/// # Return +/// +/// Returns an `EnergyTerm` representing the total internal energy of all fixed atoms, +/// including van der Waals, Coulomb, and hydrogen bonding contributions. +/// +/// # Errors +/// +/// Returns `EngineError` if there are issues resolving the active residues or if energy +/// scoring fails due to invalid system state or forcefield parameters. #[instrument(skip_all, name = "fixed_energy_task")] pub fn run(context: &OptimizationContext) -> Result where @@ -37,12 +57,30 @@ where Ok(total_offset_energy) } +/// Collects all atom IDs that are considered fixed during optimization. +/// +/// Fixed atoms include those in inactive residues and backbone atoms in active residues. +/// These atoms do not change during side-chain placement and their internal energy +/// is precomputed as a constant offset to avoid redundant calculations. 
+/// +/// # Arguments +/// +/// * `context` - The optimization context providing access to the molecular system and residue selection configuration. +/// +/// # Return +/// +/// A vector containing the IDs of all fixed atoms in the system. +/// +/// # Errors +/// +/// Returns `EngineError` if the active residues cannot be resolved from the context. fn collect_fixed_atom_ids(context: &OptimizationContext) -> Result, EngineError> where C: ProvidesResidueSelections + Sync, { let active_residues = context.resolve_all_active_residues()?; + // Collect atoms that are either in inactive residues or are backbone atoms in active residues. let fixed_atom_ids = context .system .atoms_iter() From 7c6a82e15b540f702d4a2e5bd21bad886d89e3e1 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 20:03:19 -0700 Subject: [PATCH 56/83] docs(engine): Add documentation for interaction energy calculation and add thread-safety note for parallel mode --- .../src/engine/tasks/interaction_energy.rs | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/crates/scream-core/src/engine/tasks/interaction_energy.rs b/crates/scream-core/src/engine/tasks/interaction_energy.rs index c0dcfb7b..7ab50bef 100644 --- a/crates/scream-core/src/engine/tasks/interaction_energy.rs +++ b/crates/scream-core/src/engine/tasks/interaction_energy.rs @@ -13,6 +13,26 @@ use rayon::prelude::*; use std::sync::Mutex; use tracing::instrument; +/// Calculates the total interaction energy between sidechains of active residues. +/// +/// This function computes the non-bonded interaction energy (van der Waals, Coulomb, hydrogen bonding) +/// between all pairs of sidechain atoms from active residues. It uses an efficient pairwise +/// approach and supports both serial and parallel computation modes depending on compilation features. +/// +/// # Arguments +/// +/// * `system` - The molecular system containing the residues and atoms. 
+/// * `forcefield` - The forcefield parameters used for energy calculations. +/// * `active_residues` - A set of residue IDs that are currently being optimized. +/// +/// # Return +/// +/// Returns an `EnergyTerm` representing the total interaction energy between all active residue pairs. +/// If there are fewer than 2 active residues, returns zero energy. +/// +/// # Errors +/// +/// Returns `EngineError` if energy scoring fails due to invalid system state or forcefield parameters. #[instrument(skip_all, name = "interaction_energy_task")] pub fn run( system: &MolecularSystem, @@ -64,6 +84,7 @@ pub fn run( #[cfg(feature = "parallel")] let total_interaction_energy = { + // Use a mutex-protected accumulator for thread-safe energy summation in parallel mode. let acc = Mutex::new(EnergyTerm::default()); iterator From ae54fa88595ca59c7e21fb442ad74beb42e477fe Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 20:06:32 -0700 Subject: [PATCH 57/83] docs(engine): Add module documentation for tasks related to side-chain placement optimization and energy calculations --- crates/scream-core/src/engine/tasks/mod.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/crates/scream-core/src/engine/tasks/mod.rs b/crates/scream-core/src/engine/tasks/mod.rs index f3578e12..185864a2 100644 --- a/crates/scream-core/src/engine/tasks/mod.rs +++ b/crates/scream-core/src/engine/tasks/mod.rs @@ -1,3 +1,10 @@ +//! Tasks for side-chain placement optimization and energy calculations. +//! +//! Tasks are the core computational units that perform specific calculations or optimizations +//! during protein side-chain placement. Each submodule implements a different type of task, +//! such as energy calculations, optimization algorithms, and clash detection. Tasks are +//! designed to be modular and composable, allowing for flexible optimization workflows. 
+ pub mod clash_detection; pub mod doublet_optimization; pub mod el_energy; From b69423ef5ebc491052d7635d974a8081d0b0a834 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 20:17:27 -0700 Subject: [PATCH 58/83] docs(engine): Add module documentation with detailed architecture and key capabilities --- crates/scream-core/src/core/mod.rs | 10 -------- crates/scream-core/src/engine/mod.rs | 34 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/crates/scream-core/src/core/mod.rs b/crates/scream-core/src/core/mod.rs index 10de6856..adeec92a 100644 --- a/crates/scream-core/src/core/mod.rs +++ b/crates/scream-core/src/core/mod.rs @@ -30,16 +30,6 @@ //! - **Rotamer library management** for side-chain conformation sampling //! - **Topology-aware atom classification** for backbone/sidechain distinction //! - **Extensible force field support** for different parameter sets -//! -//! ## Scientific Foundation -//! -//! The core module implements algorithms based on established computational chemistry -//! principles: -//! -//! - **Molecular mechanics** for energy minimization and conformational analysis -//! - **Rotamer libraries** derived from statistical analysis of protein structures -//! - **Force field methods** including Lennard-Jones, Coulomb, and hydrogen bonding potentials -//! - **Topology-based modeling** respecting molecular connectivity and stereochemistry pub mod forcefield; pub mod io; diff --git a/crates/scream-core/src/engine/mod.rs b/crates/scream-core/src/engine/mod.rs index 04dcd2ce..b3f596cd 100644 --- a/crates/scream-core/src/engine/mod.rs +++ b/crates/scream-core/src/engine/mod.rs @@ -1,3 +1,37 @@ +//! # Engine Module +//! +//! This module implements the optimization engine for automated protein side-chain placement +//! in SCREAM++, providing the computational framework for conformational optimization and +//! energy minimization workflows. +//! +//! ## Overview +//! +//! 
The engine module orchestrates the complete optimization process for protein side-chain +//! conformation prediction. It manages optimization state, coordinates computational tasks, +//! handles energy calculations, and provides a flexible framework for different optimization +//! strategies and algorithms. +//! +//! ## Architecture +//! +//! The module is organized into specialized submodules that handle different aspects +//! of the optimization process: +//! +//! - **Configuration** ([`config`]) - Optimization parameters, convergence criteria, and settings +//! - **State Tracking** ([`state`]) - Solution states, optimization progress, and result management +//! - **Progress Monitoring** ([`progress`]) - Progress reporting and user feedback mechanisms +//! - **Error Handling** ([`error`]) - Engine-specific error types and error propagation +//! +//! ## Key Capabilities +//! +//! - **Multi-strategy optimization** supporting various algorithms and convergence criteria +//! - **Parallel computation** for energy calculations and optimization tasks +//! - **Energy caching** to avoid redundant calculations during optimization +//! - **Flexible residue selection** for targeted optimization of specific protein regions +//! - **Progress monitoring** with detailed reporting and convergence tracking +//! - **Transactional modifications** ensuring system consistency during optimization +//! - **Extensible task system** allowing custom optimization algorithms +//! 
- **Comprehensive error handling** with detailed diagnostic information + pub(crate) mod cache; pub mod config; pub(crate) mod context; From 4505cb199f65153c06da95bae790883f3f5d143d Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 20:57:11 -0700 Subject: [PATCH 59/83] docs(workflows): Add documentation for side-chain placement workflow and related functions --- crates/scream-core/src/workflows/place.rs | 162 ++++++++++++++++++++++ 1 file changed, 162 insertions(+) diff --git a/crates/scream-core/src/workflows/place.rs b/crates/scream-core/src/workflows/place.rs index 26056228..6306f173 100644 --- a/crates/scream-core/src/workflows/place.rs +++ b/crates/scream-core/src/workflows/place.rs @@ -23,12 +23,38 @@ use tracing::{info, instrument}; const CLASH_THRESHOLD_KCAL_MOL: f64 = 25.0; +/// Represents the result of a side-chain placement workflow. +/// +/// This struct contains the initial state of the system and a collection of optimized +/// solutions found during the placement process. #[derive(Debug, Clone)] pub struct PlacementResult { + /// The initial state of the molecular system before optimization. pub initial_state: InitialState, + /// A vector of optimized solutions, sorted by optimization score. pub solutions: Vec, } +/// Executes the complete side-chain placement workflow. +/// +/// This function orchestrates the entire optimization process for protein side-chain +/// conformation prediction, including preparation, energy calculations, clash resolution, +/// optional simulated annealing, and final refinement. +/// +/// # Arguments +/// +/// * `initial_system` - The input molecular system to optimize. +/// * `config` - Configuration parameters for the placement workflow. +/// * `reporter` - Progress reporter for tracking optimization progress. +/// +/// # Return +/// +/// Returns a `PlacementResult` containing the initial state and optimized solutions. 
+/// +/// # Errors +/// +/// Returns `EngineError` if any step in the workflow fails due to invalid input, +/// configuration issues, or computational errors. #[instrument(skip_all, name = "placement_workflow")] pub fn run( initial_system: &MolecularSystem, @@ -156,6 +182,24 @@ pub fn run( Ok(result) } +/// Prepares the optimization context by resolving active residues from the configuration. +/// +/// This function determines which residues will be optimized based on the user's selection +/// criteria and ensures the rotamer library contains appropriate conformations. +/// +/// # Arguments +/// +/// * `initial_system` - The input molecular system. +/// * `config` - Configuration containing residue selection criteria. +/// * `rotamer_library` - Mutable reference to the rotamer library for potential updates. +/// +/// # Return +/// +/// A set of residue IDs that will be optimized. +/// +/// # Errors +/// +/// Returns `EngineError` if residue selection fails or if selected residues are invalid. fn prepare_context( initial_system: &MolecularSystem, config: &PlacementConfig, @@ -169,6 +213,23 @@ fn prepare_context( Ok(active_residues) } +/// Calculates the initial state of the system including baseline energies. +/// +/// This function computes the constant energy offset and evaluates the energy +/// of the input conformation to establish a baseline for optimization. +/// +/// # Arguments +/// +/// * `context` - The optimization context containing system and forcefield. +/// * `active_residues` - Set of residues being optimized. +/// +/// # Return +/// +/// A tuple containing the initial state and the constant energy offset. +/// +/// # Errors +/// +/// Returns `EngineError` if energy calculations fail. fn calculate_initial_state( context: &OptimizationContext, active_residues: &HashSet, @@ -202,6 +263,25 @@ fn calculate_initial_state( Ok((initial_state, energy_offset_constant)) } +/// Initializes the optimization state by placing ground-state rotamers. 
+/// +/// This function selects the lowest-energy rotamer for each active residue based on +/// the Empty Lattice energy cache and places them on the system to establish +/// an initial optimization state. +/// +/// # Arguments +/// +/// * `context` - The optimization context. +/// * `active_residues` - Set of residues to optimize. +/// * `el_cache` - Precomputed Empty Lattice energy cache. +/// +/// # Return +/// +/// An initialized `OptimizationState` with ground-state rotamers placed. +/// +/// # Errors +/// +/// Returns `EngineError` if rotamer placement or energy calculation fails. fn initialize_optimization_state( context: &OptimizationContext, active_residues: &HashSet, @@ -254,6 +334,23 @@ fn initialize_optimization_state( )) } +/// Performs iterative clash resolution using doublet optimization. +/// +/// This function iteratively identifies and resolves steric clashes by optimizing +/// pairs of residues simultaneously. It continues until no clashes remain or +/// convergence criteria are met. +/// +/// # Arguments +/// +/// * `state` - Mutable reference to the current optimization state. +/// * `active_residues` - Set of residues being optimized. +/// * `context` - The optimization context. +/// * `el_cache` - Empty Lattice energy cache. +/// * `energy_grid` - Mutable reference to the energy grid for efficient updates. +/// +/// # Errors +/// +/// Returns `EngineError` if clash detection or optimization fails. fn resolve_clashes( state: &mut OptimizationState, active_residues: &HashSet, @@ -383,6 +480,23 @@ fn resolve_clashes( Ok(()) } +/// Runs simulated annealing optimization if configured. +/// +/// This function implements a simulated annealing algorithm to explore the conformational +/// space more thoroughly. It uses temperature-based acceptance criteria to escape local +/// minima and find better global solutions. +/// +/// # Arguments +/// +/// * `state` - Mutable reference to the current optimization state. 
+/// * `active_residues` - Set of residues being optimized. +/// * `context` - The optimization context. +/// * `el_cache` - Empty Lattice energy cache. +/// * `energy_grid` - Mutable reference to the energy grid. +/// +/// # Errors +/// +/// Returns `EngineError` if simulated annealing configuration is invalid or execution fails. fn run_simulated_annealing( state: &mut OptimizationState, active_residues: &HashSet, @@ -515,6 +629,23 @@ fn run_simulated_annealing( Ok(()) } +/// Performs final refinement through singlet optimization passes. +/// +/// This function runs multiple passes of single-residue optimization to fine-tune +/// the solution. Each pass evaluates all residues and applies the best rotamer +/// change found, continuing until no improvements are made. +/// +/// # Arguments +/// +/// * `state` - Mutable reference to the current optimization state. +/// * `active_residues` - Set of residues being optimized. +/// * `context` - The optimization context. +/// * `el_cache` - Empty Lattice energy cache. +/// * `energy_grid` - Mutable reference to the energy grid. +/// +/// # Errors +/// +/// Returns `EngineError` if refinement operations fail. fn final_refinement( state: &mut OptimizationState, active_residues: &HashSet, @@ -637,6 +768,22 @@ fn final_refinement( Ok(()) } +/// Finalizes and organizes the optimization results. +/// +/// This function collects all solutions found during optimization, sorts them by energy, +/// removes duplicates, and ensures the initial conformation is included if it's among +/// the best solutions. +/// +/// # Arguments +/// +/// * `state` - The final optimization state. +/// * `initial_state` - The initial state before optimization. +/// * `energy_offset` - The constant energy offset to add to all solutions. +/// * `num_solutions` - Maximum number of solutions to return. +/// +/// # Return +/// +/// A `PlacementResult` containing the initial state and sorted solutions. 
fn finalize_results( state: OptimizationState, initial_state: InitialState, @@ -683,6 +830,21 @@ fn finalize_results( } } +/// Updates the rotamer assignment in the optimization state. +/// +/// This function applies a new rotamer to a residue in the system and updates +/// the state's rotamer tracking map. +/// +/// # Arguments +/// +/// * `state` - Mutable reference to the optimization state. +/// * `res_id` - The residue ID to update. +/// * `new_rot_idx` - The index of the new rotamer to apply. +/// * `context` - The optimization context containing rotamer library and topology. +/// +/// # Errors +/// +/// Returns `EngineError` if rotamer placement fails. fn update_rotamers_in_state( state: &mut OptimizationState, res_id: ResidueId, From d7bd8618d70db41dc64f8652de1ec16c78b8cb94 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 20:57:50 -0700 Subject: [PATCH 60/83] docs(workflows): Add documentation for workflows module with detailed overview and key capabilities --- crates/scream-core/src/workflows/mod.rs | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/crates/scream-core/src/workflows/mod.rs b/crates/scream-core/src/workflows/mod.rs index 66c34dbb..8d8444f1 100644 --- a/crates/scream-core/src/workflows/mod.rs +++ b/crates/scream-core/src/workflows/mod.rs @@ -1 +1,29 @@ +//! # Workflows Module +//! +//! This module provides high-level workflow implementations that orchestrate complete +//! optimization processes for protein side-chain placement in SCREAM++. +//! +//! ## Overview +//! +//! Workflows are the top-level entry points for users of SCREAM++. They encapsulate +//! the entire optimization pipeline, from initial setup through final result generation. +//! Each workflow handles resource loading, parameter validation, progress reporting, +//! and result organization, providing a clean and simple API for complex optimization tasks. +//! +//! ## Architecture +//! +//! 
The module is organized around specific optimization workflows: +//! +//! - **Placement Workflow** ([`place`]) - Complete side-chain conformation optimization +//! including clash resolution, simulated annealing, and refinement phases. +//! +//! ## Key Capabilities +//! +//! - **End-to-end optimization** from molecular input to optimized conformations +//! - **Resource management** including forcefield, topology, and rotamer library loading +//! - **Progress monitoring** with detailed phase and task reporting +//! - **Result organization** with sorted solutions and energy analysis +//! - **Error handling** with comprehensive diagnostic information +//! - **Flexible configuration** supporting various optimization strategies + pub mod place; From 43784430bb956f98c952c25034aa4348140b0195 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 23:05:17 -0700 Subject: [PATCH 61/83] docs(project): Update module documentation to include architecture overview and module descriptions --- crates/scream-core/src/lib.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/crates/scream-core/src/lib.rs b/crates/scream-core/src/lib.rs index 04614a15..83e221ec 100644 --- a/crates/scream-core/src/lib.rs +++ b/crates/scream-core/src/lib.rs @@ -1,3 +1,18 @@ +//! # SCREAM++ Core Library +//! +//! A modernized, high-performance library for protein side-chain placement and structure redesign. +//! +//! ## Architecture +//! +//! The library is structured into three primary public modules, designed to be used at different +//! levels of abstraction: +//! +//! - [`workflows`]: The highest-level API. +//! +//! - [`engine`]: The configuration and state management layer. +//! +//! - [`core`]: The foundational layer. It contains the fundamental data models, file I/O traits, and lower-level computational tools. 
+ pub mod core; pub mod engine; pub mod workflows; From 27c2f0fa48a9c690dd62989df6920219cb065dae Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sat, 6 Sep 2025 23:05:35 -0700 Subject: [PATCH 62/83] docs(workflows): Clarify documentation for PlacementResult struct by specifying that solutions are sorted by optimization score (best first) --- crates/scream-core/src/workflows/place.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/scream-core/src/workflows/place.rs b/crates/scream-core/src/workflows/place.rs index 6306f173..c269f535 100644 --- a/crates/scream-core/src/workflows/place.rs +++ b/crates/scream-core/src/workflows/place.rs @@ -31,7 +31,7 @@ const CLASH_THRESHOLD_KCAL_MOL: f64 = 25.0; pub struct PlacementResult { /// The initial state of the molecular system before optimization. pub initial_state: InitialState, - /// A vector of optimized solutions, sorted by optimization score. + /// A vector of optimized solutions, sorted by optimization score (best first). pub solutions: Vec, } From e98be7b1e787e689afd983ffda4d8169919c8e5a Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sun, 7 Sep 2025 10:25:22 -0700 Subject: [PATCH 63/83] docs(cli): Create comprehensive user manual for SCREAM++ CLI with setup instructions and command usage examples --- docs/cli/USAGE.md | 461 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 461 insertions(+) create mode 100644 docs/cli/USAGE.md diff --git a/docs/cli/USAGE.md b/docs/cli/USAGE.md new file mode 100644 index 00000000..3bacdb55 --- /dev/null +++ b/docs/cli/USAGE.md @@ -0,0 +1,461 @@ +# SCREAM++ CLI: User Manual + +Welcome to the SCREAM++ command-line interface! This manual will guide you through configuring and using the `scream` CLI for protein side-chain placement and structural optimization. 
+ +## Table of Contents + +- [SCREAM++ CLI: User Manual](#scream-cli-user-manual) + - [Table of Contents](#table-of-contents) + - [Setting Up the Data Directory](#setting-up-the-data-directory) + - [Downloading Data Files](#downloading-data-files) + - [Managing the Data Path](#managing-the-data-path) + - [Core Functionality: Side-Chain Placement (`scream place`)](#core-functionality-side-chain-placement-scream-place) + - [Basic Usage](#basic-usage) + - [Argument Reference](#argument-reference) + - [Core Arguments](#core-arguments) + - [Forcefield Overrides](#forcefield-overrides) + - [Optimization Overrides](#optimization-overrides) + - [General Arguments](#general-arguments) + - [The Configuration File (`config.toml`)](#the-configuration-file-configtoml) + - [Configuration Structure](#configuration-structure) + - [Example Configuration File](#example-configuration-file) + - [Detailed Configuration Options](#detailed-configuration-options) + - [The `[forcefield]` Table](#the-forcefield-table) + - [The `[sampling]` Table](#the-sampling-table) + - [The `[optimization]` Table](#the-optimization-table) + - [The `[residues-to-optimize]` Table](#the-residues-to-optimize-table) + - [Practical Examples (Use Cases)](#practical-examples-use-cases) + - [Example 1: Simple Global Optimization](#example-1-simple-global-optimization) + - [Example 2: Using a Specific Rotamer Library and s-factor](#example-2-using-a-specific-rotamer-library-and-s-factor) + - [Example 3: Optimizing a Ligand Binding Pocket](#example-3-optimizing-a-ligand-binding-pocket) + - [Example 4: Generating Multiple Solutions with Templated Naming](#example-4-generating-multiple-solutions-with-templated-naming) + - [Configuration Reference Table](#configuration-reference-table) + +--- + +## Setting Up the Data Directory + +SCREAM++ relies on a set of data files, including forcefield parameters, rotamer libraries, and topology definitions. Before you can use the `place` command, you must download this data. 
+ +### Downloading Data Files + +This is the **mandatory first step** for using SCREAM++. Execute the following command to download and automatically unpack all required data files to their default location: + +```sh +scream data download +``` + +This command fetches the same versioned data bundle from the official GitHub Releases page. If you have downloaded the data before but wish to force an update, use the `--force` flag: + +```sh +scream data download --force +``` + +### Managing the Data Path + +By default, data files are stored in your operating system's standard user data directory (e.g., `~/.local/share/screampp` on Linux). + +- **View the current data path**: + + ```sh + scream data path + ``` + +- **Set a custom data path**: + + If you wish to store the data files in a different location (e.g., a project directory or a shared server path), use `set-path`: + + ```sh + scream data set-path /path/to/my/custom/data/directory + ``` + + After setting this, future `download` and `place` commands will use this new path. + +- **Reset to the default path**: + + To revert to the default data storage location, run: + + ```sh + scream data reset-path + ``` + +--- + +## Core Functionality: Side-Chain Placement (`scream place`) + +The `place` command is the heart of SCREAM++, used to perform side-chain optimization on an input protein structure. + +### Basic Usage + +The simplest usage involves providing an input file and an output file path: + +```sh +scream place -i input_protein.bgf -o optimized_protein.bgf +``` + +This command will optimize all side-chains in `input_protein.bgf` using built-in default parameters and save the lowest-energy structure to `optimized_protein.bgf`. + +### Argument Reference + +You can precisely control the optimization process via command-line arguments or a configuration file. Command-line arguments will always override settings in a configuration file. 
+ +#### Core Arguments + +- `-i, --input <PATH>` (**Required**): + Specifies the path to the input molecular structure file. Currently, only BGF format is supported. + +- `-o, --output <PATH>` (**Required**): + Specifies the path for the output file(s). This is a powerful template that allows for dynamic file naming based on the optimization results. Supported placeholders include: + + - `{i}` or `{n}`: The rank of the solution (1 is best). + - `{total}` or `{N}`: The total number of solutions generated. + - `{energy}` or `{total_energy}`: The total energy of the solution (kcal/mol). + - `{score}` or `{opt_score}`: The internal optimization score of the solution. + - _Example_: `-o "solution_{i}_of_{total}_E_{energy}.bgf"` + +- `-c, --config <PATH>`: + Specifies the path to a configuration file in TOML format. This is the recommended method for complex setups (e.g., selecting specific residues). + +#### Forcefield Overrides + +These arguments allow you to quickly override forcefield settings from the configuration file. + +- `-s, --s-factor <FLOAT>`: + Sets the `s-factor` for the flat-bottom potential. This value influences the energy function's tolerance for minor atomic clashes. The default is typically `1.1`. + +- `--forcefield-path <PATH_OR_NAME>`: + Specifies the forcefield parameter file. Can be a local file path or a logical name (e.g., `'lj-12-6@0.4'`). + +- `--delta-params-path <PATH_OR_NAME>`: + Specifies the `delta` parameter file for the flat-bottom potential correction. Can be a local file path or a logical name (e.g., `'rmsd-1.0'`). + +- `-t, --topology-registry <PATH_OR_NAME>`: + Specifies the residue topology registry file. The default `'default'` is usually sufficient. + +#### Optimization Overrides + +- `-l, --rotamer-library <PATH_OR_NAME>`: + Specifies the rotamer library to use. This is a critical parameter as it defines the conformational space for side-chain sampling. Can be a local file path or a logical name (e.g., `'charmm@rmsd-1.0'`).
Available charge schemes include `amber`, `charmm`, `qeq`, `amber-n`, `charmm-n`, and `qeq-n`. Available resolutions (diversity) include `rmsd-0.1` to `rmsd-5.0` (0.1 increments) and `all-torsion`. + +- `-n, --num-solutions <N>`: + Specifies the number of lowest-energy solutions to generate and save. Defaults to `1`. + +- `--max-iterations <N>`: + Sets the maximum number of iterations for the clash resolution phase. + +- `--with-input-conformation` / `--no-input-conformation`: + Force the inclusion or exclusion of the original side-chain conformation from the input structure as a candidate. + +- `--no-refinement`: + Disables the final singlet optimization (refinement) stage. This can speed up calculations but may slightly reduce accuracy. + +- `--no-annealing`: + Disables the simulated annealing process, even if it is enabled in the configuration file. + +- `-S, --set <KEY=VALUE>`: + Directly set a configuration value from the command line for quick overrides. Example: `-S optimization.max-iterations=200`. + +#### General Arguments + +- `-v, --verbose`: Increase the log verbosity. `-v` (INFO), `-vv` (DEBUG), `-vvv` (TRACE). +- `-q, --quiet`: Suppress all log output except for errors. +- `--log-file <PATH>`: Write logs to a specified file in addition to console output. +- `-j, --threads <N>`: Set the number of threads for parallel computation. Defaults to all available CPU cores. (If you are running a multi-threaded job on an HPC system, you can omit this option and let the system manage CPU usage; SCREAM++ will automatically detect the number of cores available to your job.) + +--- + +## The Configuration File (`config.toml`) + +For complex or reproducible tasks, using a TOML configuration file is highly recommended. + +### Configuration Structure + +The configuration file is organized into four main sections (TOML tables): + +1. `[forcefield]`: Parameters related to the energy function. +2. `[sampling]`: Parameters related to conformational sampling. +3.
`[optimization]`: Parameters to control the optimization algorithm. +4. `[residues-to-optimize]`: Defines the scope of the optimization. + +### Example Configuration File + +Here is an example configuration file with all common options and detailed comments. You can copy this and modify it to suit your needs. + +```toml +# ============================================================================= +# SCREAM++ Example Configuration File +# ============================================================================= + +# ----------------------------------------------------------------------------- +# [forcefield] - Energy Function & Parameters +# ----------------------------------------------------------------------------- +[forcefield] + +# -- Core Parameters -- + +# Path or logical name for the main forcefield parameter file. +# Logical names: 'exp-6@0.4', 'lj-12-6@0.4', etc. +# Default: "exp-6@0.4" +forcefield-path = "exp-6@0.4" + +# Path or logical name for the flat-bottom delta parameters. +# The diversity (e.g., "rmsd-1.0") should match your rotamer library. +# Default: "rmsd-1.0" +delta-params-path = "rmsd-1.0" + +# The 's-factor' for the flat-bottom potential. This is a critical parameter +# that tunes the tolerance for atomic clashes. +# Default: 1.1 +s-factor = 1.1 + +# -- [Optional] Advanced Energy Weighting -- +# +# This section allows you to scale energy terms for interactions between +# different types of atoms (Atom Roles: Backbone, Sidechain, Ligand, Water, Other). +# By default, all weights are 1.0. 
+# +# [[forcefield.energy-weights.rules]] +# groups = ["Backbone", "Sidechain"] +# weights = { vdw = 0.8, coulomb = 0.8, hbond = 1.0 } +# +# [[forcefield.energy-weights.rules]] +# groups = ["Sidechain", "Ligand"] +# weights = { vdw = 0.5, coulomb = 1.0, hbond = 1.2 } + + +# ----------------------------------------------------------------------------- +# [sampling] - Conformational Sampling +# ----------------------------------------------------------------------------- +[sampling] + +# Path or logical name for the rotamer library. +# The diversity (e.g., "rmsd-1.0") should match your delta-params-path. +# Logical names: 'charmm@rmsd-1.0', 'amber@rmsd-1.0', etc. +# Default: "charmm@rmsd-1.0" +rotamer-library = "charmm@rmsd-1.0" + + +# ----------------------------------------------------------------------------- +# [optimization] - Algorithm Control +# ----------------------------------------------------------------------------- +[optimization] + +# Number of lowest-energy, unique solutions to generate and save. +# Default: 1 +num-solutions = 1 + +# Maximum number of iterations for the primary clash-resolution loop. +# Default: 100 +max-iterations = 100 + +# If true, the original side-chain conformation from the input structure +# will be included as a candidate during the optimization. +# Default: true +include-input-conformation = true + +# Number of refinement iterations (singlet optimization) to perform after the +# main clash-resolution loop has converged. Set to 0 to disable. +# Default: 2 +final-refinement-iterations = 2 + +# -- [Optional] Simulated Annealing -- +# +# To enable simulated annealing for better global energy landscape exploration, +# uncomment this entire section. This may improve results but will increase runtime. +# +# [optimization.simulated-annealing] +# initial-temperature = 5.0 # Starting temperature (in energy units). +# final-temperature = 0.1 # Temperature at which to stop the annealing. 
+# cooling-rate = 0.9 # Multiplicative factor to decrease temperature (e.g., T_new = T_old * 0.9). +# steps-per-temperature = 100 # Number of Monte Carlo moves to attempt at each temperature step. + +# -- Convergence Criteria -- +# +# Defines the conditions for stopping the clash-resolution loop. +# +[optimization.convergence] +# The loop will stop if the best energy found does not improve by at least +# this amount (in kcal/mol) over a 'patience' number of iterations. +# Default: 0.01 +energy-threshold = 0.01 + +# The number of consecutive iterations without sufficient energy improvement +# before the algorithm is considered to have converged. +# Default: 5 +patience-iterations = 5 + + +# ----------------------------------------------------------------------------- +# [residues-to-optimize] - Defines the Scope of Optimization +# ----------------------------------------------------------------------------- +[residues-to-optimize] + +# TYPE 1: Optimize all residues in the protein. +type = "all" + +# TYPE 2: Optimize a specific list of residues. +# type = "list" +# # 'include' defines a whitelist. If 'include' is empty, all residues are selected. +# include = [ +# { chain-id = 'A', residue-number = 25 }, +# { chain-id = 'A', residue-number = 101 }, +# ] +# # 'exclude' defines a blacklist that overrides the selection. +# exclude = [ +# { chain-id = 'A', residue-number = 50 }, +# ] + +# TYPE 3: Optimize residues within a radius of a ligand. +# type = "ligand-binding-site" +# # Define the radius in Angstroms from any heavy atom of the ligand. +# radius-angstroms = 5.0 +# # Specify the ligand's location. +# [residues-to-optimize.ligand-residue] +# chain-id = 'X' +# residue-number = 999 +``` + +### Detailed Configuration Options + +#### The `[forcefield]` Table + +- `forcefield-path`: Specifies forcefield parameters. Logical name format: `<potential>@<version>`, e.g., `exp-6@0.4`. +- `delta-params-path`: Specifies `delta` parameters. Logical name format: `rmsd-<diversity>` or `all-torsion`.
The `<diversity>` should match the rotamer library's `diversity`. +- `s-factor`: A key parameter. It is strongly recommended to use a value consistent with `delta-params-path` and `rotamer-library`. +- `energy-weights`: An advanced option. Allows you to assign different weights to interactions between different atom roles (e.g., Backbone-Sidechain). + +#### The `[sampling]` Table + +- `rotamer-library`: Specifies the rotamer library. Logical name format: `<scheme>@<diversity>`, e.g., `charmm@rmsd-1.0`. `scheme` can be `charmm`, `amber`, etc. The `diversity` should match `delta-params-path`. + +#### The `[optimization]` Table + +- `simulated-annealing`: If this section is present, simulated annealing will be enabled. This helps escape local energy minima but increases computation time. +- `convergence`: Controls when the iterative algorithm stops. + +#### The `[residues-to-optimize]` Table + +This is the core section for defining the scope of your optimization. + +- `type = "all"`: The simplest case; all residues are optimized. +- `type = "list"`: + - `include`: Defines a whitelist of residues to optimize. If `include` is empty, all residues are selected by default. + - `exclude`: Defines a blacklist. These residues will **not** be optimized, even if they are in the `include` list. +- `type = "ligand-binding-site"`: + - `ligand-residue`: Specifies the ligand residue. + - `radius-angstroms`: Defines a spherical region. Any protein residue with a heavy atom (non-hydrogen) inside this sphere will be selected for optimization. + +--- + +## Practical Examples (Use Cases) + +Assume we have an input file `protein.bgf`. + +### Example 1: Simple Global Optimization + +**Goal**: Quickly optimize all side-chains in the protein and save the best result. + +```sh +scream place -i protein.bgf -o protein_optimized.bgf +``` + +> This uses all default parameters and is ideal for a quick initial assessment.
+ +### Example 2: Using a Specific Rotamer Library and s-factor + +**Goal**: Perform optimization using the CHARMM charge scheme and a finer rotamer library (`rmsd-0.6`). + +```sh +scream place \ + -i protein.bgf \ + -o protein_charmm_0.6.bgf \ + -l charmm@rmsd-0.6 \ + --delta-params-path rmsd-0.6 \ + -s 1.2 +``` + +> **Note**: The `diversity` (`rmsd-0.6`) of the `rotamer-library` and `delta-params-path` must match. The `s-factor` (`1.2`) is a recommended value optimized for this diversity. + +### Example 3: Optimizing a Ligand Binding Pocket + +**Goal**: The input file `complex.bgf` contains a protein and a ligand (chain X, residue number 999). We want to optimize only the protein side-chains within 5 Å of the ligand. + +First, create a configuration file `pocket_opt.toml`: + +```toml +# pocket_opt.toml +[residues-to-optimize] +type = "ligand-binding-site" +radius-angstroms = 5.0 + +[residues-to-optimize.ligand-residue] +chain-id = 'X' +residue-number = 999 +``` + +Then, run the command: + +```sh +scream place -i complex.bgf -o complex_pocket_optimized.bgf -c pocket_opt.toml +``` + +> **Note**: Using energy weights is recommended for binding site optimization, as it allows advanced control and fine-tuning. For example, you can increase the strength of sidechain-ligand hydrogen bonds or reduce the van der Waals interactions between sidechains and backbone to achieve specific optimization goals. + +### Example 4: Generating Multiple Solutions with Templated Naming + +**Goal**: We are uncertain which conformation is best and want to generate the top 3 lowest-energy solutions, naming the output files based on their rank and energy. 
+ +```sh +scream place \ + -i protein.bgf \ + -o "protein_sol_{i}_E_{energy}.bgf" \ + -n 3 +``` + +This will generate files with names like: + +- `protein_sol_1_E_-1234.56.bgf` (The lowest energy solution) +- `protein_sol_2_E_-1232.10.bgf` +- `protein_sol_3_E_-1230.05.bgf` + +--- + +## Configuration Reference Table + +This table provides a comprehensive mapping between the `scream place` command-line arguments and their equivalent settings in the `config.toml` file. Use it as a quick reference for all available configuration options. Command-line arguments always take precedence over the configuration file. + +| CLI Argument (Short) | CLI Argument (Long) | `config.toml` Key | Value Type | Default | Description | +| :------------------- | :-------------------------- | :------------------------------------------------------- | :-------------------- | :---------------- | :----------------------------------------------------------------------------------------- | +| `-i` | `--input` | _(N/A)_ | File Path | **Required** | Path to the input molecular structure file (.bgf). | +| `-o` | `--output` | _(N/A)_ | File Path Template | **Required** | Path for the output file(s), supports templating. | +| `-c` | `--config` | _(N/A)_ | File Path | (None) | Path to the main TOML configuration file. | +| **---** | **---** | **Forcefield Settings** | **---** | **---** | **---** | +| `-s` | `--s-factor` | `forcefield.s-factor` | Float | `1.1` | The scaling factor (`s`) for the flat-bottom potential. | +| | `--forcefield-path` | `forcefield.forcefield-path` | String (Path/Logical) | `exp-6@0.4` | Path or logical name for the forcefield parameters. | +| | `--delta-params-path` | `forcefield.delta-params-path` | String (Path/Logical) | `rmsd-1.0` | Path or logical name for the flat-bottom delta parameters. | +| `-t` | `--topology-registry` | `topology-registry-path` | String (Path/Logical) | `default` | Path or logical name for the residue topology registry. 
| +| **---** | **---** | **Sampling & Optimization** | **---** | **---** | **---** | +| `-l` | `--rotamer-library` | `sampling.rotamer-library` | String (Path/Logical) | `charmm@rmsd-1.0` | Path or logical name for the rotamer library. | +| `-n` | `--num-solutions` | `optimization.num-solutions` | Integer | `1` | The number of top solutions to generate and save. | +| | `--max-iterations` | `optimization.max-iterations` | Integer | `100` | Maximum number of iterations for the clash resolution phase. | +| | `--with-input-conformation` | `optimization.include-input-conformation` | Boolean Flag | `true` | Forces **inclusion** of the input conformation as a candidate. | +| | `--no-input-conformation` | `optimization.include-input-conformation` | Boolean Flag | `true` | Forces **exclusion** of the input conformation. | +| | `--no-refinement` | `optimization.final-refinement-iterations` | Boolean Flag | `2` | Disables the final refinement stage (sets iterations to 0). | +| | `--no-annealing` | `optimization.simulated-annealing` | Boolean Flag | (Disabled) | Disables simulated annealing, even if set in the config file. | +| **---** | **---** | **Convergence Settings** | **---** | **---** | **---** | +| | _(N/A)_ | `optimization.convergence.energy-threshold` | Float | `0.01` | Stop if energy improvement is less than this value (kcal/mol). | +| | _(N/A)_ | `optimization.convergence.patience-iterations` | Integer | `5` | Number of iterations without improvement before stopping. | +| **---** | **---** | **Simulated Annealing** | **---** | **---** | **---** | +| | _(N/A)_ | `optimization.simulated-annealing.initial-temperature` | Float | (N/A) | Starting temperature for the annealing schedule. | +| | _(N/A)_ | `optimization.simulated-annealing.final-temperature` | Float | (N/A) | Ending temperature for the annealing schedule. | +| | _(N/A)_ | `optimization.simulated-annealing.cooling-rate` | Float (0-1) | (N/A) | Multiplicative factor for decreasing temperature. 
| +| | _(N/A)_ | `optimization.simulated-annealing.steps-per-temperature` | Integer | (N/A) | Number of moves to attempt at each temperature step. | +| **---** | **---** | **Residue Selection** | **---** | **---** | **---** | +| | _(N/A)_ | `residues-to-optimize` | Table | `type = "all"` | Defines which residues to optimize. See manual for `list` and `ligand-binding-site` types. | +| **---** | **---** | **Advanced Overrides** | **---** | **---** | **---** | +| `-S` | `--set` | _(Various)_ | String (`KEY=VALUE`) | (None) | Overrides a specific config value directly. E.g., `-S optimization.num-solutions=5`. | +| **---** | **---** | **General Settings** | **---** | **---** | **---** | +| `-j` | `--threads` | _(N/A)_ | Integer | (CPU Cores) | Number of threads for parallel computation. | +| `-v` | `--verbose` | _(N/A)_ | Count Flag | (Off) | Increase verbosity (`-v`, `-vv`, `-vvv`). | +| `-q` | `--quiet` | _(N/A)_ | Boolean Flag | (Off) | Suppress all output except errors. | +| | `--log-file` | _(N/A)_ | File Path | (None) | Path to write a detailed log file. | From debb418ed86413baefe8f221bd1eb6a37d2bb890 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sun, 7 Sep 2025 10:30:23 -0700 Subject: [PATCH 64/83] docs(cli): Add complete configuration file template for SCREAM++ CLI --- docs/cli/config.template.toml | 129 ++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 docs/cli/config.template.toml diff --git a/docs/cli/config.template.toml b/docs/cli/config.template.toml new file mode 100644 index 00000000..6f8ba332 --- /dev/null +++ b/docs/cli/config.template.toml @@ -0,0 +1,129 @@ +# ============================================================================= +# SCREAM++ Complete Configuration File Template +# ============================================================================= +# +# This file contains ALL available settings for a `scream place` run. +# - To use a setting, uncomment it and modify its value. 
+# - Settings that are commented out will use the application's default value, +# which is shown in the comment for reference. +# - For detailed explanations, please refer to the full user manual. + +# ----------------------------------------------------------------------------- +# [forcefield] - Energy function and forcefield parameters +# ----------------------------------------------------------------------------- +[forcefield] + +# s-factor for the flat-bottom potential. This is a critical parameter for accuracy. +# Its optimal value is dependent on the diversity of the chosen rotamer library. +# Default: 1.1 (optimized for rmsd-1.0 libraries) +s-factor = 1.1 + +# Path or logical name for the main forcefield parameter file. +# Logical names: 'exp-6@0.4', 'lj-12-6@0.4', etc. +# Default: "exp-6@0.4" +# forcefield-path = "exp-6@0.4" + +# Path or logical name for the flat-bottom delta parameter file. +# The diversity (e.g., "rmsd-1.0") should match the rotamer library. +# Default: "rmsd-1.0" +# delta-params-path = "rmsd-1.0" + +# [Optional] Rules for applying custom weights to energy components between +# different types of atoms (Atom Roles: Backbone, Sidechain, Ligand, Water, Other). +# By default, all interactions have a weight of 1.0. +# [[forcefield.energy-weights.rules]] +# groups = ["Backbone", "Sidechain"] +# weights = { vdw = 1.0, coulomb = 1.0, hbond = 1.0 } + +# ----------------------------------------------------------------------------- +# [sampling] - Side-chain conformation sampling +# ----------------------------------------------------------------------------- +[sampling] + +# Path or logical name for the rotamer library. +# The diversity (e.g., "rmsd-1.0") should match `delta-params-path`. +# Logical names: 'charmm@rmsd-1.0', 'amber@rmsd-1.0', etc. 
+# Default: "charmm@rmsd-1.0" +# rotamer-library = "charmm@rmsd-1.0" + +# ----------------------------------------------------------------------------- +# [optimization] - Algorithm control +# ----------------------------------------------------------------------------- +[optimization] + +# The number of lowest-energy, unique solutions to generate and save. +# Default: 1 +num-solutions = 1 + +# Maximum number of iterations for the main clash resolution algorithm. +# Default: 100 +# max-iterations = 100 + +# Whether to include the input structure's original side-chain conformation as +# a candidate solution during the optimization. +# Default: true +# include-input-conformation = true + +# Number of refinement iterations (singlet optimization) to run after the +# main loop converges. Set to 0 to disable. +# Default: 2 +# final-refinement-iterations = 2 + +# [optimization.convergence] +# --- Convergence Criteria --- +# The algorithm is considered converged if the best energy improves by less than +# this threshold over a 'patience' number of iterations. +# Default: 0.01 (kcal/mol) +# energy-threshold = 0.01 + +# The number of consecutive iterations with insufficient energy improvement +# before the optimization loop terminates. +# Default: 5 +# patience-iterations = 5 + +# [optimization.simulated-annealing] +# --- [Optional] Simulated Annealing --- +# To enable, uncomment this entire section. This can help the algorithm escape +# local energy minima but will increase runtime. +# initial-temperature = 5.0 +# final-temperature = 0.1 +# cooling-rate = 0.9 +# steps-per-temperature = 100 + +# ----------------------------------------------------------------------------- +# [residues-to-optimize] - Defines which residues to modify +# ----------------------------------------------------------------------------- +[residues-to-optimize] + +# Choose ONE of the following types: "all", "list", or "ligand-binding-site". + +# TYPE 1: Optimize all residues. 
+# This is the default if the section is omitted. +type = "all" + +# TYPE 2: Optimize a specific list of residues. +# `include` specifies which residues to target. If `include` is empty, it defaults to all residues. +# `exclude` specifies which residues to ignore, even if they are in the `include` selection. +# type = "list" +# include = [ +# { chain-id = 'A', residue-number = 25 }, +# { chain-id = 'A', residue-number = 101 }, +# ] +# exclude = [] + +# TYPE 3: Optimize residues within a certain radius of a ligand. +# The radius is measured from any heavy atom of the ligand to any heavy atom of a protein residue. +# type = "ligand-binding-site" +# radius-angstroms = 5.0 +# [residues-to-optimize.ligand-residue] +# chain-id = 'X' +# residue-number = 999 + +# ----------------------------------------------------------------------------- +# Global Settings +# ----------------------------------------------------------------------------- + +# Path or logical name for the residue topology registry. +# In most cases, the default is sufficient. 
+# Default: "default" (NOTE: this is a top-level key; if you uncomment it, move it above the first [table] header, otherwise TOML assigns it to the preceding table) +# topology-registry-path = "default" From f28444733639a37c8c269b7420e73038999f44f9 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sun, 7 Sep 2025 11:23:26 -0700 Subject: [PATCH 65/83] ci(project): Include CLI usage documentation files in release artifacts --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b7f449d9..f83503fc 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -69,7 +69,7 @@ jobs: SOURCE_PATH="target/${{ matrix.target }}/release/$BINARY_NAME" cp "$SOURCE_PATH" "$ARTIFACT_DIR/$BINARY_NAME" - cp LICENSE README.md "$ARTIFACT_DIR/" + cp LICENSE README.md docs/cli/USAGE.md docs/cli/config.template.toml "$ARTIFACT_DIR/" cd $ARTIFACT_DIR From 10014f3befcd72b6e52d0845714a6dbd109e0873 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sun, 7 Sep 2025 18:05:48 -0700 Subject: [PATCH 66/83] docs(core): Enhance documentation for Residue struct to clarify caching mechanism and usage --- crates/scream-core/src/core/models/residue.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/scream-core/src/core/models/residue.rs b/crates/scream-core/src/core/models/residue.rs index 88aa9bb3..d7c3cbc0 100644 --- a/crates/scream-core/src/core/models/residue.rs +++ b/crates/scream-core/src/core/models/residue.rs @@ -198,9 +198,10 @@ impl ResidueType { /// Represents a residue in a molecular structure. /// -/// This struct encapsulates the properties and atoms of a single residue, -/// providing efficient access to backbone and sidechain atoms through caching. -/// It is used in protein modeling and side-chain placement algorithms. +/// This struct encapsulates the properties and atoms of a single residue. +/// It uses lazy caching for backbone and sidechain atom lookups to improve +/// performance in frequently accessed operations.
The cache is automatically +/// invalidated when atoms are added or removed. #[derive(Debug, Clone, PartialEq)] pub struct Residue { /// The sequential number of the residue in its chain. From 4f195e03ce052e7fb21c3bc0f4d012e4b718ead0 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sun, 7 Sep 2025 19:17:03 -0700 Subject: [PATCH 67/83] docs(project): Add foundational architecture and core data models documentation for the scream-core library --- docs/cli/USAGE.md | 8 +- .../core/01_architecture_and_data_models.md | 132 ++++++++++++++++++ 2 files changed, 136 insertions(+), 4 deletions(-) create mode 100644 docs/dev/core/01_architecture_and_data_models.md diff --git a/docs/cli/USAGE.md b/docs/cli/USAGE.md index 3bacdb55..0b1c839c 100644 --- a/docs/cli/USAGE.md +++ b/docs/cli/USAGE.md @@ -172,10 +172,10 @@ For complex or reproducible tasks, using a TOML configuration file is highly rec The configuration file is organized into four main sections (TOML tables): -1. `[forcefield]`: Parameters related to the energy function. -2. `[sampling]`: Parameters related to conformational sampling. -3. `[optimization]`: Parameters to control the optimization algorithm. -4. `[residues-to-optimize]`: Defines the scope of the optimization. +1. `[forcefield]`: Parameters related to the energy function. +2. `[sampling]`: Parameters related to conformational sampling. +3. `[optimization]`: Parameters to control the optimization algorithm. +4. `[residues-to-optimize]`: Defines the scope of the optimization. ### Example Configuration File diff --git a/docs/dev/core/01_architecture_and_data_models.md b/docs/dev/core/01_architecture_and_data_models.md new file mode 100644 index 00000000..bd7a8fa9 --- /dev/null +++ b/docs/dev/core/01_architecture_and_data_models.md @@ -0,0 +1,132 @@ +# 1. Core Library: Architecture and Data Models + +This document outlines the foundational architecture and the core data models of the `scream-core` library. 
Understanding these concepts is essential for navigating, using, and contributing to the scientific core of SCREAM++. + +**Table of Contents** + +- [1. Core Library: Architecture and Data Models](#1-core-library-architecture-and-data-models) + - [1. Architectural Philosophy: The Three-Layer Pyramid](#1-architectural-philosophy-the-three-layer-pyramid) + - [1.1. The `core` Layer: The Foundation](#11-the-core-layer-the-foundation) + - [1.2. The `engine` Layer: The Logic Core](#12-the-engine-layer-the-logic-core) + - [1.3. The `workflows` Layer: The Public API](#13-the-workflows-layer-the-public-api) + - [2. Core Data Models: The Representation of a Molecular World](#2-core-data-models-the-representation-of-a-molecular-world) + - [2.1. `MolecularSystem`: The Central Data Store](#21-molecularsystem-the-central-data-store) + - [2.2. Identifier Stability: The Role of `slotmap`](#22-identifier-stability-the-role-of-slotmap) + - [2.3. Structural Knowledge Representation](#23-structural-knowledge-representation) + +--- + +## 1. Architectural Philosophy: The Three-Layer Pyramid + +The `scream-core` library is designed with a clear, hierarchical three-layer architecture. This separation of concerns ensures that the library is modular, highly testable, and easy to extend. Each layer has a distinct responsibility, building upon the one below it. + +**Figure 1: The Three-Layer Architecture** + +```mermaid +graph TD + subgraph scream-core + W[workflows Layer] --> E[engine Layer] + E --> C[core Layer] + end + + subgraph Responsibilities + W_Desc("Public API
End-to-end scientific procedures.
e.g., place::run()") + E_Desc("State Machine & Algorithms
Manages the optimization process.
e.g., EnergyGrid, OptimizationState") + C_Desc("Foundation & Toolbox
Stateless data structures and pure computations.
e.g., MolecularSystem, Scorer") + end + + W -.-> W_Desc + E -.-> E_Desc + C -.-> C_Desc +``` + +### 1.1. The `core` Layer: The Foundation + +This is the bedrock of the library. It contains stateless data structures and pure computational logic. + +- **Responsibility**: To define **what** a molecular system is and **how** to perform fundamental calculations on it. +- **Key Modules**: + - `models`: Defines the data representation for `MolecularSystem`, `Atom`, `Residue`, `Chain`, and their relationships. It is the single source of truth for structural data. + - `forcefield`: Implements the mathematical formulas for potential energy functions (`potentials`), defines parameter structures (`params`), and provides the `Scorer` for calculating energy between atoms. It does not know anything about optimization. + - `topology` & `rotamers`: Define the structural "knowledge base" of the system, such as which atoms constitute a sidechain (`TopologyRegistry`) and the possible discrete conformations (`RotamerLibrary`). + - `io`: Handles the serialization and deserialization of the `models` to and from standard file formats like BGF. + +### 1.2. The `engine` Layer: The Logic Core + +The engine is the stateful "brain" of the optimization process. It takes the tools from the `core` layer and orchestrates them according to a specific algorithm. + +- **Responsibility**: To manage the **state** and **logic** of an optimization workflow. +- **Key Components**: + - `config`: Defines all parameters for a computational task, such as `PlacementConfig`. + - `context`: A lightweight, read-only "view" that provides the engine's tasks with access to the `system`, `forcefield`, and `config`. + - `state`: Tracks the progress of an optimization, including the best solutions found so far (`OptimizationState`). + - `cache` & `energy_grid`: High-performance data structures designed to prevent redundant calculations and enable efficient incremental energy updates. 
+ - `transaction`: Implements the `SystemView` model, which allows for temporary, reversible modifications to the `MolecularSystem` for "what-if" energy calculations without expensive cloning. + - `tasks`: Contains the discrete steps of an algorithm, such as `clash_detection` or `el_energy` calculation. + +### 1.3. The `workflows` Layer: The Public API + +This is the highest-level, user-facing layer. It ties the `engine` and `core` together to execute a complete, end-to-end scientific procedure. + +- **Responsibility**: To provide simple, powerful entry points for common scientific tasks. +- **Example**: The `workflows::place::run` function is the primary entry point for side-chain placement. It takes a `MolecularSystem` and a `PlacementConfig`, orchestrates the entire sequence of `engine` tasks, and returns a final, easy-to-use `PlacementResult`. + +## 2. Core Data Models: The Representation of a Molecular World + +The data models in `scream-core::core::models` are designed to be both comprehensive and efficient. + +### 2.1. `MolecularSystem`: The Central Data Store + +The `MolecularSystem` is the central struct that owns all structural data. Its design follows a relational database model, where entities (`Chain`, `Residue`, `Atom`) are stored in their own collections and linked by stable identifiers. 
+ +**Figure 2: Entity Relationship Diagram of `MolecularSystem`** + +```mermaid +erDiagram + CHAIN { + ChainId id PK "Stable primary key" + char chain_char "e.g., 'A'" + ChainType type "e.g., Protein, Ligand" + } + RESIDUE { + ResidueId id PK "Stable primary key" + isize residue_number "Sequence number" + string name "e.g., 'ALA'" + ChainId chain_id FK "Foreign key to CHAIN" + } + ATOM { + AtomId id PK "Stable primary key" + string name "e.g., 'CA'" + Point3_f64_ position "3D coordinates" + ResidueId residue_id FK "Foreign key to RESIDUE" + } + BOND { + AtomId atom1_id PK,FK "Compound primary key" + AtomId atom2_id PK,FK "and foreign key to ATOM" + BondOrder order "e.g., Single, Double" + } + + CHAIN ||--|{ RESIDUE : "contains" + RESIDUE ||--|{ ATOM : "contains" + ATOM }o--o{ ATOM : "bonded_via (symmetric)" +``` + +- **`CHAIN`**: Contains a list of `ResidueId`s. +- **`RESIDUE`**: Belongs to one `CHAIN` and contains a list of `AtomId`s. +- **`ATOM`**: Belongs to one `RESIDUE`. +- **`BOND`**: Represents a connection between two `ATOM` entities. Stored separately in a flat list within `MolecularSystem`. + +### 2.2. Identifier Stability: The Role of `slotmap` + +A critical design choice is the use of the `slotmap` crate for generating identifiers (`AtomId`, `ResidueId`, `ChainId`). + +- **The Problem**: If we used simple `Vec` and referred to atoms by their index (`usize`), removing an atom from the middle of the vector would shift all subsequent indices, invalidating any stored references. This is a common source of bugs ("dangling pointers" or "off-by-one errors"). +- **The Solution**: `slotmap` provides generational identifiers. An `AtomId` remains valid for the entire lifetime of an atom. If the atom is deleted, its "slot" can be reused later, but a new, different `AtomId` will be generated. This ensures that an old, invalid ID will never accidentally point to a new atom. This makes operations like removing side-chains during placement robust and safe. + +### 2.3. 
Structural Knowledge Representation + +Beyond the core system, two key structures encode the "rules" of protein chemistry: + +- **`TopologyRegistry` (`core::topology::registry`)**: This acts as a dictionary that defines the "blueprint" for each type of amino acid. For a given residue name (e.g., "ALA"), it specifies which atom names belong to the conserved **backbone** (`anchor_atoms`) and which belong to the variable **sidechain** (`sidechain_atoms`). This is fundamental for nearly all operations, from coloring atoms in a viewer to separating fixed vs. mobile parts during optimization. + +- **`RotamerLibrary` (`core::rotamers::library`)**: This is the discrete library of pre-calculated, low-energy side-chain conformations. It is the key simplification that makes the side-chain placement problem computationally tractable. Each entry maps a `ResidueType` to a list of possible `Rotamer` structures. The `engine` uses this library to sample different conformations during the optimization search. From 9567b9686d52b01559be6c315fd9d815c02de799 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sun, 7 Sep 2025 21:04:34 -0700 Subject: [PATCH 68/83] docs(project): Add detailed documentation for forcefield and energy calculation in SCREAM++ --- docs/dev/core/02_energy_calculation.md | 145 ++ docs/dev/core/images/flat_bottom_vdw.svg | 2143 ++++++++++++++++++++++ 2 files changed, 2288 insertions(+) create mode 100644 docs/dev/core/02_energy_calculation.md create mode 100644 docs/dev/core/images/flat_bottom_vdw.svg diff --git a/docs/dev/core/02_energy_calculation.md b/docs/dev/core/02_energy_calculation.md new file mode 100644 index 00000000..4d8f2f74 --- /dev/null +++ b/docs/dev/core/02_energy_calculation.md @@ -0,0 +1,145 @@ +# 2. Forcefield and Energy Calculation + +This document details the theoretical foundation and algorithmic implementation of energy calculation in SCREAM++. 
It connects the underlying physics to the specific energy terms used throughout the optimization engine. + +**Table of Contents** + +- [2. Forcefield and Energy Calculation](#2-forcefield-and-energy-calculation) + - [2.1. The Physical Model: Pairwise Non-Bonded Energy](#21-the-physical-model-pairwise-non-bonded-energy) + - [2.1.1. The Fundamental Equation](#211-the-fundamental-equation) + - [2.1.2. The Five Atom Sets (Conceptual)](#212-the-five-atom-sets-conceptual) + - [2.2. The Algorithmic Model: Decomposing for Efficiency](#22-the-algorithmic-model-decomposing-for-efficiency) + - [2.2.1. From Five Sets to Two: Active vs. Fixed](#221-from-five-sets-to-two-active-vs-fixed) + - [2.2.2. The Core Decomposition of Total Energy](#222-the-core-decomposition-of-total-energy) + - [2.2.3. Mapping Decomposition to SCREAM++ Concepts](#223-mapping-decomposition-to-scream-concepts) + - [2.3. The Flat-Bottom Strategy](#23-the-flat-bottom-strategy) + - [2.3.1. The Problem: Discrete Libraries vs. Continuous Reality](#231-the-problem-discrete-libraries-vs-continuous-reality) + - [2.3.2. The Solution: A "Forgiving" Potential](#232-the-solution-a-forgiving-potential) + +--- + +## 2.1. The Physical Model: Pairwise Non-Bonded Energy + +At its most fundamental level, the total non-bonded energy of a molecular system in SCREAM++ is modeled as the sum of all pairwise interactions between atoms. Bonded interactions (bonds, angles, dihedrals) are considered fixed within the rigid rotamers and backbone, and thus do not change during the side-chain placement optimization. + +### 2.1.1. 
The Fundamental Equation + +The total non-bonded energy, $E_{\text{total}}$, is given by the summation over all unique atom pairs $(i, j)$: + +$$ +E_{\mathrm{total}} = \sum_{i=1}^{N} \sum_{j=i+1}^{N} E_{\mathrm{pair}}(i,j) +$$ + +The pairwise energy $E_{\text{pair}}$ is composed of three terms, implemented in `core::forcefield::potentials`: + +$$ +E_{\text{pair}}(i, j) = E_{\text{vdw}}(i, j) + E_{\text{coulomb}}(i, j) + E_{\text{hbond}}(i, j) +$$ + +- **$E_{\text{vdw}}$**: Van der Waals interaction, typically modeled by a Lennard-Jones 12-6 or Buckingham potential. +- **$E_{\text{coulomb}}$**: Electrostatic interaction, modeled by Coulomb's law. +- **$E_{\text{hbond}}$**: A specialized hydrogen bond potential (e.g., Dreiding 12-10 with an angular term). + +### 2.1.2. The Five Atom Sets (Conceptual) + +For the side-chain placement problem, it is useful to conceptually divide all atoms in the system into five distinct sets. This helps in understanding which interactions change during optimization. + +**Figure 1: Conceptual Atom Sets** + +```mermaid +graph TD + subgraph MolecularSystem + S_mob("Mobile Sidechains (S_mob)
Sidechain atoms of residues being optimized.
Their positions change.") + B_mob("Mobile Backbone (B_mob)
Backbone atoms of residues being optimized.") + S_fix("Fixed Sidechains (S_fix)
Sidechain atoms of non-optimized residues.") + B_fix("Fixed Backbone (B_fix)
Backbone atoms of non-optimized residues.") + Env("Environment (Env)
Ligands, water, ions, etc.") + end +``` + +The key insight for optimization is that **only the coordinates of atoms in the $S_{\text{mob}}$ set change** when a new rotamer is placed. All other atoms are considered fixed in space. + +## 2.2. The Algorithmic Model: Decomposing for Efficiency + +Calculating the full pairwise sum $E_{\text{total}}$ at every optimization step is computationally prohibitive. The SCREAM++ engine uses a strategic decomposition of this energy to isolate the parts that change from those that remain constant. + +### 2.2.1. From Five Sets to Two: Active vs. Fixed + +For the algorithm, we simplify the five conceptual sets into two operational sets: + +- **The Active Set (A)**: This set contains all atoms whose positions are subject to change during optimization. In the side-chain placement problem, this is exclusively the **Mobile Sidechains ($S_{\text{mob}}$)**. + > $A = S_{\text{mob}}$ +- **The Fixed Set (F)**: This set contains all other atoms in the system, which form a static background or "lattice" against which the active atoms are evaluated. + > $F = B_{\text{mob}} \cup S_{\text{fix}} \cup B_{\text{fix}} \cup \text{Env}$ + +### 2.2.2. The Core Decomposition of Total Energy + +Using the Active (A) and Fixed (F) sets, the total energy can be rewritten as the sum of three distinct components, where $E(X, Y)$ denotes the sum of all pairwise interactions between atoms in set X and set Y: + +$$ +E_{\text{total}} = E(A, A) + E(A, F) + E(F, F) +$$ + +- **$E(F, F)$**: The internal energy of the fixed set. This term is **constant** throughout the optimization. +- **$E(A, F)$**: The interaction energy between the active set and the fixed set. +- **$E(A, A)$**: The internal interaction energy within the active set. + +The sum $E(A, A) + E(A, F)$ is the portion of the energy that changes during optimization. We define this as the **Optimization Score**. + +### 2.2.3. 
Mapping Decomposition to SCREAM++ Concepts + +The SCREAM++ engine directly implements this decomposition: + +- **Fixed Energy ($E_{\text{fixed}}$)**: + + - **Definition**: $E_{\text{fixed}} = E(F, F)$ + - **Description**: The constant energy offset of the system. It is calculated only once at the beginning of the workflow. + - **Implementation**: `engine::tasks::fixed_energy` + +- **Interaction Energy ($E_{\text{interaction}}$)**: + + - **Definition**: $E_{\text{interaction}} = \sum_{i < j} E(A_i, A_j)$, where $A_i$ denotes the side-chain atoms of active residue $i$. This is $E(A, A)$ without the internal energy of each individual side-chain, which is accounted for in $E_{\text{EL}}$ below. + - **Description**: The energy arising from interactions _between_ the side-chains of different active residues. This term captures how well the moving parts fit together. + - **Implementation**: `engine::tasks::interaction_energy` + +- **Empty Lattice Energy ($E_{\text{EL}}$)**: + - **Definition**: For a single active residue $i$ with a specific rotamer $r$, its Empty Lattice energy $E_{\text{EL}}(i_r)$ is defined as the energy of its side-chain atoms ($A_i$) interacting with the entire fixed set ($F$), plus the internal energy of the side-chain itself. $E_{\text{EL}}(i_r) = E(A_i, F) + E_{\text{internal}}(A_i)$ + - **Description**: This is the energy contribution of a single rotamer in the "empty lattice" of the fixed environment. Since this interaction is independent of other active rotamers, it can be pre-calculated for every possible rotamer of every active residue. + - **Implementation**: `engine::tasks::el_energy` pre-computes these values and stores them in the `ELCache`. + +Finally, the **Optimization Score** used by the engine is assembled by summing over every active residue $i$ in its currently assigned rotamer $r$: + +$$ +E_{\text{optimization}} = E_{\text{interaction}} + \sum_{i \,\in\, \text{active residues}} E_{\text{EL}}(i_r) +$$ + +This decomposition is the key to the engine's performance. Instead of a full $O(N^2)$ calculation at each step, the engine can perform efficient $O(N)$ updates by leveraging pre-computed $E_{\text{EL}}$ values and incrementally updating $E_{\text{interaction}}$. This is discussed further in `04_performance_and_memory.md`. + +## 2.3. 
The Flat-Bottom Strategy + +The "Flat-Bottom Strategy" is the core scientific innovation of SCREAM, designed to address a fundamental limitation of discrete rotamer libraries. + +### 2.3.1. The Problem: Discrete Libraries vs. Continuous Reality + +Rotamer libraries provide a finite set of conformations for each side-chain. It is highly unlikely that any single rotamer in the library perfectly matches the true, continuous optimal position of the side-chain in the protein. This small discrepancy can trigger the following chain of events: + +- The closest-matching rotamer produces a slight steric clash (e.g., two atoms are 2.9Å apart instead of an ideal 3.0Å). +- In a standard Lennard-Jones potential, this small clash results in a massive, disproportionate repulsive energy penalty. +- The optimization algorithm incorrectly discards this near-perfect rotamer in favor of a much worse one that avoids the minor clash. + +### 2.3.2. The Solution: A "Forgiving" Potential + +The Flat-Bottom Strategy modifies the repulsive part of the VDW and H-bond potentials to be more tolerant of these small inaccuracies. + +**Figure 2: Comparison of Standard vs. Flat-Bottom VDW Potential** + +![Flat-Bottom VDW Potential](./images/flat_bottom_vdw.svg) + +- **Mechanism**: A "flat bottom" of width $\Delta$ is introduced into the potential well. If the distance $r$ between two interacting atoms falls within the range $[R_e - \Delta, R_e]$ (where $R_e$ is the ideal, minimum-energy distance for that pair), the energy is clamped to the minimum value, $E(R_e)$. No penalty is applied for this minor clash. +- **The Delta ($\Delta$) Parameter**: The width of this tolerance region is not arbitrary. It is atom-specific and derived from the uncertainty of atom positions in the rotamer library. + $$ + \Delta = s \cdot \sigma + $$ + - **$\sigma$ (sigma)**: This value is pre-calculated and stored in `data/delta/delta-*.csv`. It represents the standard deviation of an atom's position from its ideal crystal structure location, based on the "coarseness" (diversity) of the rotamer library. 
Atoms further down the side-chain have a larger $\sigma$. + - **$s$ (s-factor)**: A global, user-configurable scaling factor (`--s-factor` or `s-factor` in config) that uniformly tunes the "forgiveness" of the potential. This is a critical parameter for balancing accuracy and conformational search. +- **Implementation**: This logic is encapsulated within `core::forcefield::potentials::apply_flat_bottom_vdw` and `apply_flat_bottom_hbond`. diff --git a/docs/dev/core/images/flat_bottom_vdw.svg b/docs/dev/core/images/flat_bottom_vdw.svg new file mode 100644 index 00000000..5db6e1bd --- /dev/null +++ b/docs/dev/core/images/flat_bottom_vdw.svg @@ -0,0 +1,2143 @@ + + + + + + + + 2025-09-08T02:41:43.824988 + image/svg+xml + + + Matplotlib v3.6.3, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 10caad852e7bbc4c7c794a6d97145af6a56c3d7e Mon Sep 17 00:00:00 2001 
From: Tony Kan Date: Sun, 7 Sep 2025 22:22:03 -0700 Subject: [PATCH 69/83] docs(project): Add comprehensive documentation for algorithms and workflows in scream-core --- docs/dev/core/03_algorithms_and_workflows.md | 273 +++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100644 docs/dev/core/03_algorithms_and_workflows.md diff --git a/docs/dev/core/03_algorithms_and_workflows.md b/docs/dev/core/03_algorithms_and_workflows.md new file mode 100644 index 00000000..4cf64fa6 --- /dev/null +++ b/docs/dev/core/03_algorithms_and_workflows.md @@ -0,0 +1,273 @@ +# 3. Algorithms and Workflows + +This document provides a detailed, top-down explanation of the algorithms and workflows that power `scream-core`. It connects the high-level user-facing API to the underlying engine logic and core computational tasks, serving as a comprehensive guide for developers looking to understand, modify, or extend the library. + +**Table of Contents** + +- [3. Algorithms and Workflows](#3-algorithms-and-workflows) + - [3.1. The `place` Workflow: The Top-Level Conductor](#31-the-place-workflow-the-top-level-conductor) + - [3.1.1. High-Level Execution Flow](#311-high-level-execution-flow) + - [3.2. The Optimization Engine: A Deeper Dive](#32-the-optimization-engine-a-deeper-dive) + - [3.2.1. Phase 1: Preparation \& Pre-computation](#321-phase-1-preparation--pre-computation) + - [3.2.2. Phase 2: State Initialization](#322-phase-2-state-initialization) + - [3.2.3. Phase 3: The Iterative Optimization Loop (Clash Resolution)](#323-phase-3-the-iterative-optimization-loop-clash-resolution) + - [3.2.4. Phase 4: Search \& Refinement](#324-phase-4-search--refinement) + - [3.2.5. Phase 5: Finalization](#325-phase-5-finalization) + - [3.3. Core Algorithms and Data Structures in Action](#33-core-algorithms-and-data-structures-in-action) + - [3.3.1. The Incremental Energy Update Model with `EnergyGrid`](#331-the-incremental-energy-update-model-with-energygrid) + - [3.3.2. 
Doublet Optimization: Heuristic for Clash Resolution](#332-doublet-optimization-heuristic-for-clash-resolution) + - [3.3.3. The Transactional Model: Safe, Temporary System Modification with `SystemView`](#333-the-transactional-model-safe-temporary-system-modification-with-systemview) + +--- + +## 3.1. The `place` Workflow: The Top-Level Conductor + +The primary entry point for any side-chain placement task is the `workflows::place::run` function. From a developer's perspective, this function serves as a high-level "conductor," orchestrating the complex sequence of operations performed by the `engine` and `core` layers. + +### 3.1.1. High-Level Execution Flow + +The workflow can be conceptualized as a five-stage pipeline. Each stage builds upon the results of the previous one, progressively refining the state of the molecular system to find low-energy side-chain conformations. + +**Figure 1: High-Level `place` Workflow** + +```mermaid +graph TD + subgraph Inputs + A["MolecularSystem"]; + B["PlacementConfig"]; + end + + subgraph Workflow Stages + C["Stage 1: Setup Resources"]; + D["Stage 2: Initialize Engine State"]; + E["Stage 3: Run Iterative Optimization"]; + F["Stage 4: Perform Global Search & Refinement"]; + G["Stage 5: Finalize and Collect Results"]; + end + + subgraph Output + H["PlacementResult"]; + end + + A --> C; + B --> C; + C --> D; + D --> E; + E --> F; + F --> G; + G --> H; +``` + +- **Stage 1: Setup Resources**: The workflow begins by loading and preparing all necessary data based on the `PlacementConfig`. This includes loading the forcefield parameters, the specified rotamer library, and the residue topology registry. It also determines the set of `active_residues` to be optimized. +- **Stage 2: Initialize Engine State**: The engine computes baseline energies. The constant `E_fixed` is calculated, and the initial energy of the input conformation is determined. 
Crucially, the `ELCache` is populated by pre-calculating the Empty Lattice energy for every possible rotamer of every active residue. Finally, an initial `EnergyGrid` is built based on a "ground state" conformation. +- **Stage 3: Iterative Optimization**: This is the main loop for clash resolution. The engine repeatedly identifies the most severe steric/energetic clash in the system and attempts to resolve it by performing a **Doublet Optimization** on the two clashing residues. The system's energy is updated incrementally at each step. +- **Stage 4: Global Search & Refinement**: After the iterative clash resolution converges, optional, more computationally intensive algorithms are run. This includes **Simulated Annealing** to escape local energy minima and **Final Refinement** (Singlet Optimization) to perform a final greedy search for improvements. +- **Stage 5: Finalize and Collect Results**: The best solutions tracked by the `OptimizationState` are collected, sorted, and de-duplicated. The final `total_energy` for each solution is computed by adding the constant `E_fixed` back to the final `optimization_score`. The results are then returned in a structured `PlacementResult`. + +## 3.2. The Optimization Engine: A Deeper Dive + +This section breaks down the high-level workflow into the specific engine tasks and logic that execute each phase. + +### 3.2.1. Phase 1: Preparation & Pre-computation + +This phase sets the stage for the entire optimization process. + +**Figure 2: Preparation & Pre-computation Flow** + +```mermaid +graph TD + A["place::run(system, config)"] --> B["prepare_context"]; + B --> C["Resolve active_residues"]; + C --> D["el_energy::run"]; + D --> E["ELCache (Populated)"]; +``` + +- **`prepare_context` (in `workflows::place`)**: This internal function is the first step. It calls `engine::utils::query::resolve_selection_to_ids` to interpret the `ResidueSelection` criteria from the `PlacementConfig`. 
It translates user-friendly specifications (e.g., "all", "chain A residues 10-20") into a concrete `HashSet` of active residues, filtering out any that lack rotamers in the provided `RotamerLibrary`. +- **`engine::tasks::el_energy::run`**: This is the most significant pre-computation step. + - **Algorithm**: It iterates through every active residue and, for each one, every possible rotamer conformation available in the `RotamerLibrary`. This process is parallelized using `rayon` if the `parallel` feature is enabled. + - **Calculation**: For each `(residue, rotamer)` pair, it calculates its Empty Lattice Energy ($E_{\text{EL}}$). As defined in the previous document, this includes the rotamer's internal energy and its interaction energy with the entire fixed environment (backbone, non-active sidechains, etc.). + - **Output**: The results are stored in an `ELCache`, which acts as a lookup table: `(ResidueId, ResidueType, rotamer_idx) -> EnergyTerm`. + +### 3.2.2. Phase 2: State Initialization + +With pre-computations complete, the engine initializes the data structures that will manage the optimization state. + +**Figure 3: State Initialization Flow** + +```mermaid +graph TD + subgraph Inputs + A["Initial System"]; + B["ELCache"]; + end + + C["initialize_optimization_state"] --> D["Place ground-state rotamers"]; + D --> E["EnergyGrid::new"]; + E --> F["Full pairwise E_interaction calculation"]; + F --> G["Initial OptimizationState"]; + + A --> C; + B --> C; + C --> E; +``` + +- **`initialize_optimization_state` (in `workflows::place`)**: + - **Algorithm**: For each active residue, this function queries the `ELCache` to find the rotamer with the lowest $E_{\text{EL}}$. This set of lowest-energy rotamers constitutes the "ground state" conformation. + - **Action**: It then modifies a copy of the system by placing these ground-state rotamers onto their respective residues. 
+- **`engine::energy_grid::EnergyGrid::new`**: + - **Algorithm**: This constructor receives the system with ground-state rotamers. It performs a **one-time, full, all-pairs calculation** of the interaction energy ($E_{\text{interaction}}$) between the side-chains of all active residues. + - **Output**: It populates its internal data structures: `pair_interactions` (storing energy between each pair) and `total_residue_interactions` (storing the sum of interactions for each residue). This populated `EnergyGrid` is the starting point for all subsequent incremental updates. + - Finally, an `OptimizationState` is created to hold the initial state and track future solutions. + +### 3.2.3. Phase 3: The Iterative Optimization Loop (Clash Resolution) + +This is the core iterative refinement loop of the SCREAM algorithm. + +**Figure 4: Clash Resolution Loop** + +```mermaid +graph TD + Start --> A{Loop until convergence
or max_iterations}; + A --> B["clash_detection::run
Calculates all pairwise energies,
returns pairs > threshold"]; + B --> C{Any clashes?}; + C -- No --> G[End Loop]; + C -- Yes --> D["Select worst clash pair (A, B)"]; + D --> E["doublet_optimization::run(A, B)
Finds best rotamer pair (i*, j*) for A and B"]; + E --> F["Update System & EnergyGrid
with new rotamers (i*, j*)"]; + F --> A; +``` + +- **`engine::tasks::clash_detection::run`**: At the beginning of each iteration, this task uses the current `EnergyGrid`'s `pair_interactions` to quickly find all residue pairs whose interaction energy exceeds a predefined threshold (e.g., 25 kcal/mol). It returns a list of these `ClashPair`s, sorted with the most severe clash first. +- **`engine::tasks::doublet_optimization::run`**: The engine then focuses on the worst clash pair. This task performs an exhaustive search over all rotamer combinations for these two residues. Its goal is to find the pair of rotamers that minimizes the local energy (their Empty Lattice energies, their mutual interaction energy, and their interactions with all other active residues). A detailed breakdown of this algorithm is in Section 3.3.2. +- **Update**: Once the best new rotamer pair is found, the `EnergyGrid` is updated. This is a highly efficient operation, as described in Section 3.3.1. The system state is modified, and the new total energy is submitted to the `OptimizationState`. +- **Convergence**: The loop terminates if no clashes are found, the maximum number of iterations is reached, or the best energy found does not improve significantly over a set number of "patience" iterations (`ConvergenceConfig`). + +### 3.2.4. Phase 4: Search & Refinement + +After the primary loop, these optional phases further polish the solution. + +- **Simulated Annealing (`run_simulated_annealing`)**: + - **Algorithm**: If enabled, this phase uses a Monte Carlo approach. It repeatedly picks a random residue, proposes a random new rotamer, and calculates the change in energy ($\Delta E$). The move is always accepted if $\Delta E < 0$. Otherwise, it is accepted with a probability of $P(\text{accept}) = e^{-\Delta E / T}$, where $T$ is the current "temperature". The temperature is gradually lowered according to a cooling schedule. 
+ - **Purpose**: This allows the search to occasionally accept "worse" moves, giving it the ability to escape from local energy minima and explore a wider conformational space. + +**Figure 5: Simulated Annealing Flow** + +```mermaid +graph TD + Start --> A{Loop T > T_final}; + A -- T <= T_final --> End; + A --> B{Loop steps_per_T}; + B -- End Inner Loop --> G["T = T * cooling_rate"]; + G --> A; + B --> C["Select random residue & new rotamer"]; + C --> D["Calculate ΔE via EnergyGrid"]; + D --> E{"Accept move? (Metropolis criterion)"}; + E -- Yes --> F["Apply Move & Update State"]; + E -- No --> B; + F --> B; +``` + +- **Final Refinement (`final_refinement`)**: + - **Algorithm**: This is a greedy, iterative **Singlet Optimization**. It runs for a fixed number of passes (e.g., `final-refinement-iterations`). In each pass, it iterates through every active residue one by one. For each residue, it evaluates the energy of all its possible rotamers (given the current state of all other residues) and greedily selects the one that results in the lowest total energy. + - **Purpose**: To perform a final, local "polishing" of the best solution found so far. + +### 3.2.5. Phase 5: Finalization + +The final step is to prepare the results for the user. + +- **`finalize_results` (in `workflows::place`)**: + - **Algorithm**: This function retrieves the sorted list of best solutions from the `OptimizationState`'s `BinaryHeap`. + - **Logic**: It handles the `include_input_conformation` option by comparing the initial state's energy with the found solutions and inserting it into the list if it ranks high enough. It also performs de-duplication to remove solutions with nearly identical energies. + - **Energy Conversion**: It converts the final **Optimization Score** of each solution into a physically meaningful **Total Energy** by adding back the pre-calculated `E_fixed`. + +## 3.3. 
Core Algorithms and Data Structures in Action + +This section provides a deeper look into the key mechanisms that enable the engine's performance and correctness. + +### 3.3.1. The Incremental Energy Update Model with `EnergyGrid` + +The `EnergyGrid` is the performance centerpiece of the optimization engine. It transforms the computationally expensive problem of energy recalculation into a fast, incremental update. + +**Figure 6: Incremental Energy Update Flow** + +```mermaid +graph TD + A["Propose Move:
Residue i: rotamer r → r'"] --> B["calculate_delta_for_move(i, r')"]; + subgraph B + B1["Use SystemView::transaction"]; + B2["Temporarily place rotamer r'"]; + B3["Calculate new E(i_r', j) for all j ≠ i"]; + B4["Calculate ΔE_interaction"]; + B5["Lookup ΔE_EL from ELCache"]; + B6["ΔE_optimization = ΔE_interaction + ΔE_EL"]; + B1-->B2-->B3-->B4-->B5-->B6; + end + B --> C["MoveDelta object"]; + C --> D["apply_move(MoveDelta)"]; + subgraph D + D1["Update total score: score += ΔE_optimization"]; + D2["Update E_EL(i)"]; + D3["Update pair_interactions for all pairs involving i"]; + D4["Update total_residue_interactions for all neighbors j"]; + end + D --> E["Updated EnergyGrid"]; +``` + +- **The Problem**: A naive approach would re-calculate all pairwise interactions ($O(N^2)$ pairs) after every single rotamer change. For a system with 200 active residues, this is nearly 20,000 pairs. +- **The Solution**: When a single residue `i` changes its rotamer, only its interactions with the other `N-1` active residues change. The interactions between any other pair `(j, k)` where `j,k ≠ i` remain constant. +- **`calculate_delta_for_move`**: This function computes the total change in energy without iterating over all pairs. + 1. It uses a `SystemView` transaction to temporarily place the new rotamer. + 2. It calculates the **new** interaction energies between the new rotamer and all other active side-chains ($O(N)$ calculations). + 3. It finds the change in interaction energy: $\Delta E_{\text{interaction}} = E_{\text{interaction}}^{\text{new}} - E_{\text{interaction}}^{\text{old}}$. The old value is efficiently retrieved from the `EnergyGrid`. + 4. It finds the change in Empty Lattice energy, $\Delta E_{\text{EL}}$, by a simple $O(1)$ lookup in the `ELCache`. + 5. The total change is $\Delta E_{\text{optimization}} = \Delta E_{\text{interaction}} + \Delta E_{\text{EL}}$. 
+- **`apply_move`**: This function takes the `MoveDelta` object and updates the `EnergyGrid`'s internal tables in $O(N)$ time, reflecting the new energy landscape. + +### 3.3.2. Doublet Optimization: Heuristic for Clash Resolution + +This task (`engine::tasks::doublet_optimization`) is the primary workhorse for resolving unfavorable interactions. + +**Figure 7: Doublet Optimization Algorithm** + +```mermaid +graph TD + Start(Input: res_a, res_b) --> A{"For each rotamer pair (i for a, j for b)"}; + A --> B["Heuristic Pruning:
If E_EL(a_i) + E_EL(b_j) is already
worse than the best total energy found,
skip this pair."]; + B -- Passes --> C["Calculate local energy:
E_local = E_EL(a_i) + E_EL(b_j) +
E_int(a_i, b_j) +
Σ E_int(a_i, others) +
Σ E_int(b_j, others)"]; + B -- Fails --> A; + C --> F{E_local < best_E_so_far?}; + F -- Yes --> G[Update best_E_so_far and best_pair]; + G --> A; + F -- No --> A; + A -- All pairs done --> End(best_pair); +``` + +- **Algorithm**: It performs a brute-force search over the cross-product of the rotamer sets for two residues, `res_a` and `res_b`. +- **Energy Calculation**: For each pair of rotamers `(i, j)`, it calculates a total local energy. This includes their pre-computed $E_{\text{EL}}$ values and their newly computed interaction energies with each other and with all other active residues in the system. +- **Optimization**: A critical performance heuristic is used. Before performing the expensive interaction energy calculations, it checks if the sum of the two rotamers' $E_{\text{EL}}$ is already greater than the best total energy found so far. If it is, this pair is pruned from the search space. This is a heuristic rather than a rigorous bound: interaction energies are typically repulsive (positive) or only weakly attractive (slightly negative), so they are very unlikely to compensate for an already unfavorable $E_{\text{EL}}$ sum. + +### 3.3.3. The Transactional Model: Safe, Temporary System Modification with `SystemView` + +The `SystemView` (`engine::transaction`) is a key piece of the engine's design, ensuring both correctness and performance. + +**Figure 8: `SystemView::transaction` Lifecycle** + +```mermaid +graph TD + A["Call transaction(res_id, action_closure)"] --> B["Store original_rotamer_idx of res_id"]; + B --> C["Execute user's `action_closure`"]; + subgraph "Inside Closure" + D["User code calls view.apply_move(res_id, new_idx)"]; + D --> E["System geometry is modified.
current_rotamers map is updated."]; + end + C --> E; + E --> F["Closure returns result"]; + F --> G["Compare current_rotamer_idx with original"]; + G --> H{Was it modified?}; + H -- Yes --> I["Revert system state by placing original rotamer.
Revert current_rotamers map."]; + I --> J["Return closure's result"]; + H -- No --> J; +``` + +- **The Problem**: During energy calculation for a proposed move (e.g., in `calculate_delta_for_move`), we need to see what the system _would_ look like with the new rotamer to calculate its interactions. However, we don't want to permanently commit this change until we know if the move will be accepted. Cloning the entire `MolecularSystem` for every check would be extremely slow and memory-intensive. +- **The Solution**: `SystemView` holds a mutable reference to the _single, shared_ `MolecularSystem`. The `transaction` method saves the state of the residue(s) of interest, allows the provided closure to make temporary modifications, and then **guarantees** that the original state is restored before the function returns. This provides the illusion of a temporary copy without the performance overhead. This is a powerful pattern that ensures the `EnergyGrid` and the `MolecularSystem` remain consistent and correct throughout the optimization. From 1c848780e31dc45b9716da357fee708771f32420 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sun, 7 Sep 2025 23:01:35 -0700 Subject: [PATCH 70/83] docs(project): Add performance and memory optimization strategies documentation --- docs/dev/core/03_algorithms_and_workflows.md | 10 +- docs/dev/core/04_performance_and_memory.md | 163 +++++++++++++++++++ 2 files changed, 168 insertions(+), 5 deletions(-) create mode 100644 docs/dev/core/04_performance_and_memory.md diff --git a/docs/dev/core/03_algorithms_and_workflows.md b/docs/dev/core/03_algorithms_and_workflows.md index 4cf64fa6..743f59bd 100644 --- a/docs/dev/core/03_algorithms_and_workflows.md +++ b/docs/dev/core/03_algorithms_and_workflows.md @@ -216,11 +216,11 @@ graph TD - **The Problem**: A naive approach would re-calculate all pairwise interactions ($O(N^2)$ pairs) after every single rotamer change. For a system with 200 active residues, this is nearly 20,000 pairs. 
- **The Solution**: When a single residue `i` changes its rotamer, only its interactions with the other `N-1` active residues change. The interactions between any other pair `(j, k)` where `j,k ≠ i` remain constant. - **`calculate_delta_for_move`**: This function computes the total change in energy without iterating over all pairs. - 1. It uses a `SystemView` transaction to temporarily place the new rotamer. - 2. It calculates the **new** interaction energies between the new rotamer and all other active side-chains ($O(N)$ calculations). - 3. It finds the change in interaction energy: $\Delta E_{\text{interaction}} = E_{\text{interaction}}^{\text{new}} - E_{\text{interaction}}^{\text{old}}$. The old value is efficiently retrieved from the `EnergyGrid`. - 4. It finds the change in Empty Lattice energy, $\Delta E_{\text{EL}}$, by a simple $O(1)$ lookup in the `ELCache`. - 5. The total change is $\Delta E_{\text{optimization}} = \Delta E_{\text{interaction}} + \Delta E_{\text{EL}}$. + 1. It uses a `SystemView` transaction to temporarily place the new rotamer. + 2. It calculates the **new** interaction energies between the new rotamer and all other active side-chains ($O(N)$ calculations). + 3. It finds the change in interaction energy: $\Delta E_{\text{interaction}} = E_{\text{interaction}}^{\text{new}} - E_{\text{interaction}}^{\text{old}}$. The old value is efficiently retrieved from the `EnergyGrid`. + 4. It finds the change in Empty Lattice energy, $\Delta E_{\text{EL}}$, by a simple $O(1)$ lookup in the `ELCache`. + 5. The total change is $\Delta E_{\text{optimization}} = \Delta E_{\text{interaction}} + \Delta E_{\text{EL}}$. - **`apply_move`**: This function takes the `MoveDelta` object and updates the `EnergyGrid`'s internal tables in $O(N)$ time, reflecting the new energy landscape. ### 3.3.2. 
Doublet Optimization: Heuristic for Clash Resolution diff --git a/docs/dev/core/04_performance_and_memory.md b/docs/dev/core/04_performance_and_memory.md new file mode 100644 index 00000000..c118afab --- /dev/null +++ b/docs/dev/core/04_performance_and_memory.md @@ -0,0 +1,163 @@ +# 4. Performance and Memory + +Performance is a cornerstone of SCREAM++. This document details the key architectural and algorithmic choices made to ensure the library is both CPU-efficient and memory-conscious. These strategies allow SCREAM++ to tackle large and complex protein systems effectively. + +**Table of Contents** + +- [4. Performance and Memory](#4-performance-and-memory) + - [4.1. CPU Performance Strategies](#41-cpu-performance-strategies) + - [4.1.1. Algorithmic Efficiency: The Incremental Update Model](#411-algorithmic-efficiency-the-incremental-update-model) + - [4.1.2. Parallelism with Rayon](#412-parallelism-with-rayon) + - [4.1.3. Heuristic Pruning in Optimization Tasks](#413-heuristic-pruning-in-optimization-tasks) + - [4.2. Memory Efficiency Strategies](#42-memory-efficiency-strategies) + - [4.2.1. The Transactional Model: Avoiding Clones with `SystemView`](#421-the-transactional-model-avoiding-clones-with-systemview) + - [4.2.2. Data Representation and Ownership](#422-data-representation-and-ownership) + +--- + +## 4.1. CPU Performance Strategies + +CPU performance is optimized through a combination of algorithmic cleverness, parallelism, and heuristic shortcuts. + +### 4.1.1. Algorithmic Efficiency: The Incremental Update Model + +The single most important performance feature of the optimization engine is the incremental energy update model, orchestrated by the `EnergyGrid`. + +- **The Problem**: A naive side-chain optimization algorithm would re-calculate the total energy of the system after every single change to a rotamer. For a system with $N$ active residues, the total interaction energy involves approximately $N(N-1)/2$ pairwise calculations. 
A full recalculation is an $O(N^2)$ operation, which becomes prohibitively expensive for large systems within an iterative loop. + +- **The Solution**: The `EnergyGrid` transforms this into an $O(N)$ operation. When a single residue `i` changes its conformation, the only interaction energies that change are those involving residue `i`. All other pairwise interactions $(j, k)$ where $j, k \neq i$ remain unchanged. + +**Figure 1: Energy Update Complexity** + +```mermaid +graph TD + subgraph "Naive Approach (O(N^2) Recalculation)" + direction LR + A1("Res 1") --- A2("Res 2") + A1 --- A3("Res 3") + A1 --- A4("Res 4") + A1 --- A5("Res 5 (Changed)") + + A2 --- A3 + A2 --- A4 + A2 --- A5 + + A3 --- A4 + A3 --- A5 + + A4 --- A5 + + subgraph Legend_N ["Calculations: 10 (All Pairs)"] + end + end + + subgraph "EnergyGrid Approach (O(N) Update)" + direction LR + B1("Res 1") + B2("Res 2") + B3("Res 3") + B4("Res 4") + B5("Res 5 (Changed)") + + B1 -.-> B2 + B1 -.-> B3 + B1 -.-> B4 + + B2 -.-> B3 + B2 -.-> B4 + B3 -.-> B4 + + B5 -- Recalculate --> B1 + B5 -- Recalculate --> B2 + B5 -- Recalculate --> B3 + B5 -- Recalculate --> B4 + + subgraph Legend_E ["Calculations: 4 (N-1 Pairs)"] + end + end + + linkStyle 0,1,2,3,4,5,6,7,8,9 stroke-width:2px + linkStyle 10,11,12,13,14,15 stroke-width:1px,stroke-dasharray: 5 5 + linkStyle 16,17,18,19 stroke-width:2px +``` + +- **Implementation**: This is achieved through the `calculate_delta_for_move` and `apply_move` methods on the `EnergyGrid`. + 1. `calculate_delta_for_move` computes the change in energy ($\Delta E$) by summing only the changes in the $N-1$ pairs involving the moving residue. + 2. `apply_move` then updates the `EnergyGrid`'s internal tables (`pair_interactions`, `total_residue_interactions`) by applying these deltas, again in $O(N)$ time. + +This incremental model is the primary reason the engine can perform many thousands of optimization steps per second. + +### 4.1.2. 
Parallelism with Rayon + +Many of the most computationally intensive tasks in SCREAM++ are "embarrassingly parallel," meaning they can be broken down into many independent sub-problems that can be solved concurrently. We leverage the `rayon` crate to exploit this parallelism on multi-core processors. + +- **Where Parallelism is Used**: + + - **`el_energy::run`**: The calculation of Empty Lattice energies for each `(residue, rotamer)` pair is completely independent. This task is parallelized over the list of all pairs, providing a significant speedup during the initial setup phase. + - **`doublet_optimization::run`**: The search for the best pair of rotamers for two residues is parallelized over the cross-product of their rotamer libraries. Each pair `(rotamer_i, rotamer_j)` can be evaluated independently. + - **`interaction_energy::run`**: The initial, full calculation of all pairwise interactions is parallelized over the list of all unique residue pairs. + +- **How to Use**: Parallelism is enabled via the `parallel` feature flag in `Cargo.toml`. When enabled, the CLI automatically defaults to using all available CPU cores, but the thread count can be controlled via the `-j`/`--threads` argument. The library code uses `par_iter()` (from `rayon::prelude::*`) instead of `iter()` where appropriate. + +### 4.1.3. Heuristic Pruning in Optimization Tasks + +In search algorithms like `doublet_optimization`, we can often "prune" large portions of the search space that are very unlikely to lead to a better solution. + +- **The Heuristic**: Before calculating the expensive interaction energy for a rotamer pair `(a_i, b_j)`, we first check the sum of their pre-calculated Empty Lattice energies, $E_{\text{EL}}(a_i) + E_{\text{EL}}(b_j)$. Interaction energies are typically either repulsive (positive) or only weakly attractive.
Therefore, if the sum of their $E_{\text{EL}}$ is already worse than the best total energy found so far, it is extremely unlikely that a favorable interaction energy could compensate. We therefore skip this pair and move to the next, accepting a small risk of discarding a viable pair in exchange for far fewer energy evaluations. + + > If `E_EL(a_i) + E_EL(b_j) + E_interaction_min_possible > E_best_found`, then prune. Here `E_interaction_min_possible` is the most favorable (lowest) interaction energy the pair is assumed to be able to reach; the check described above effectively treats it as approximately zero. + +- **Impact**: This simple check can prune a significant fraction of the rotamer pairs, especially later in the optimization when a good `E_best_found` has been established. This drastically reduces the number of full energy calculations required. + +## 4.2. Memory Efficiency Strategies + +Memory efficiency is crucial for handling large protein systems and for enabling effective parallelism, as each thread may require its own working space. + +### 4.2.1. The Transactional Model: Avoiding Clones with `SystemView` + +The most significant memory optimization is the avoidance of cloning the `MolecularSystem` during "what-if" calculations. + +- **The Problem**: A straightforward way to calculate the energy of a proposed move would be to clone the entire `MolecularSystem`, apply the change to the clone, calculate its energy, and then discard it. For a large protein, `MolecularSystem` can be many megabytes in size. Cloning it thousands of times per second would lead to massive memory allocation overhead and poor cache performance. + +- **The Solution**: The `engine::transaction::SystemView` provides a safe way to perform temporary modifications on a _mutable reference_ to the single, shared `MolecularSystem`. + +**Figure 2: `SystemView` Memory Model vs.
Cloning** + +```mermaid +graph TD + subgraph "Cloning Approach (High Memory Cost)" + S1["Original System"]; + M1["Propose Move 1"]; + S2["Clone 1"]; + M2["Propose Move 2"]; + S3["Clone 2"]; + S1 --> M1 --> S2; + S1 --> M2 --> S3; + end + subgraph "SystemView Approach (Low Memory Cost)" + SharedSystem["Single Mutable System"]; + T1["Transaction 1"]; + T2["Transaction 2"]; + SharedSystem -- "&mut" --> T1; + SharedSystem -- "&mut" --> T2; + T1 --> T1_M["Modify & Calculate"]; + T1_M --> T1_R["Revert"]; + T2 --> T2_M["Modify & Calculate"]; + T2_M --> T2_R["Revert"]; + end +``` + +- **Implementation**: The `transaction` and `transaction_doublet` methods on `SystemView` work by: + +1. Saving the original rotamer index (a `usize`) of the residue(s) being modified. +2. Executing a user-provided closure that is allowed to modify the system via `apply_move`. +3. After the closure finishes, it restores the original rotamer, reverting the system geometry to its initial state. + +- **Benefit**: The overhead per transaction is just a few bytes to store the original indices, rather than megabytes for a full clone. This drastically reduces memory pressure and allocation overhead. + +### 4.2.2. Data Representation and Ownership + +Rust's strict ownership and borrowing model provides memory safety guarantees that are invaluable in a complex scientific application. + +- **Clear Ownership**: The `MolecularSystem` is the unambiguous owner of all core structural data. Other components, like the `OptimizationContext` and `Scorer`, hold immutable references (`&`) to it. This prevents data races and dangling pointers that could occur in a C++ implementation with complex object lifecycles. +- **`slotmap` for Stable IDs**: As mentioned in `01_architecture_and_data_models.md`, using `slotmap` prevents memory-related bugs that arise from index invalidation after deletions, a common problem when using `Vec` indices as identifiers. 
From 18f1b4e792004c4ff6b2f41b13bc25e679600b3d Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sun, 7 Sep 2025 23:06:47 -0700 Subject: [PATCH 71/83] docs(project): Clarify energy update complexity explanation and enhance diagram details --- docs/dev/core/04_performance_and_memory.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/dev/core/04_performance_and_memory.md b/docs/dev/core/04_performance_and_memory.md index c118afab..c58e10bb 100644 --- a/docs/dev/core/04_performance_and_memory.md +++ b/docs/dev/core/04_performance_and_memory.md @@ -27,7 +27,7 @@ The single most important performance feature of the optimization engine is the - **The Solution**: The `EnergyGrid` transforms this into an $O(N)$ operation. When a single residue `i` changes its conformation, the only interaction energies that change are those involving residue `i`. All other pairwise interactions $(j, k)$ where $j, k \neq i$ remain unchanged. -**Figure 1: Energy Update Complexity** +**Figure 1: Visualizing Energy Update Complexity (Example: N=5 Active Residues)** ```mermaid graph TD @@ -81,6 +81,15 @@ graph TD linkStyle 16,17,18,19 stroke-width:2px ``` +**Explanation of the diagram:** + +- **Naive Approach**: When Residue 5 changes, the algorithm recalculates the energy for **all 10 pairs** (solid lines). The complexity grows quadratically with the number of residues. +- **EnergyGrid Approach**: + + - The interactions between residues 1, 2, 3, and 4 (thin dashed lines) are **not recalculated**. Their values are already stored in the `EnergyGrid`. + - Only the **4 pairs** involving the changed Residue 5 (solid arrows labeled "Recalculate") need to be recomputed. + - The total energy is then updated by subtracting the old interaction energies of Residue 5 and adding the new ones. The cost of this update grows linearly with the number of residues. + - **Implementation**: This is achieved through the `calculate_delta_for_move` and `apply_move` methods on the `EnergyGrid`. 1.
`calculate_delta_for_move` computes the change in energy ($\Delta E$) by summing only the changes in the $N-1$ pairs involving the moving residue. 2. `apply_move` then updates the `EnergyGrid`'s internal tables (`pair_interactions`, `total_residue_interactions`) by applying these deltas, again in $O(N)$ time. From 70deb416756eccf9e5cf6bd86c2a275382613af1 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Sun, 7 Sep 2025 23:22:41 -0700 Subject: [PATCH 72/83] docs(project): Add comprehensive developer documentation for scream-core library --- docs/dev/core/README.md | 62 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 docs/dev/core/README.md diff --git a/docs/dev/core/README.md b/docs/dev/core/README.md new file mode 100644 index 00000000..fc3a8d86 --- /dev/null +++ b/docs/dev/core/README.md @@ -0,0 +1,62 @@ +# `scream-core` Developer Documentation + +Welcome to the developer documentation for `scream-core`, the scientific heart of the SCREAM++ project. This library contains all the fundamental data structures, forcefield implementations, and optimization algorithms for protein side-chain placement. + +The purpose of this documentation is to explain the **"why"** behind the code—our architectural decisions, algorithmic strategies, and performance considerations. It is intended to be read alongside the in-code API documentation (`cargo doc`). + +## Core Philosophy + +The design of `scream-core` is guided by three core principles: + +1. **Layered Architecture**: A strict separation of concerns into three layers (`core`, `engine`, `workflows`) makes the library modular, testable, and extensible. Data representation is cleanly separated from algorithmic logic and high-level procedures. + +2. **Data-Centric Design**: Complex algorithms are built around high-performance data structures. 
The `MolecularSystem` provides a robust and safe representation of molecular data, while the `EnergyGrid` enables efficient, incremental energy calculations, which is the key to the engine's speed. + +3. **Performance by Design**: This is achieved through aggressive parallelism of independent tasks with `rayon`, efficient memory management via Rust's ownership model, and transactional, clone-free system modifications. + +## Recommended Reading Path + +For developers new to `scream-core`, we recommend reading the detailed documentation in the following order to build a comprehensive understanding of the library: + +1. **[Architecture and Data Models](./01_architecture_and_data_models.md)**: Start here to understand the foundational structure of the library and how molecular systems are represented in memory. + +2. **[Forcefield and Energy Calculation](./02_energy_calculation.md)**: Once you understand the data structures, this document explains the scientific and mathematical basis for how we calculate energies, including the core "Flat-Bottom Strategy." + +3. **[Algorithms and Workflows](./03_algorithms_and_workflows.md)**: This document connects the static data models and energy functions into a dynamic process. It details the step-by-step logic of the main side-chain placement algorithm. + +4. **[Performance and Memory](./04_performance_and_memory.md)**: Finally, read this to understand the key optimizations, parallelism strategies, and memory management techniques that make `scream-core` fast and efficient. + +## Documentation Index + +Here is a quick reference to the detailed documentation for each major component of the library. + +--- + +### **[1. Architecture and Data Models](./01_architecture_and_data_models.md)** + +- Explains the three-layer (`core`, `engine`, `workflows`) design philosophy. +- Details the structure of `MolecularSystem` and the rationale behind using `slotmap` for stable identifiers. 
+- Describes how structural knowledge is encoded in `TopologyRegistry` and `RotamerLibrary`. + +--- + +### **[2. Forcefield and Energy Calculation](./02_energy_calculation.md)** + +- Starts from the fundamental physics of pairwise non-bonded energies. +- Breaks down the total energy into algorithmically efficient components: `E_fixed`, `E_interaction`, and `E_EL`. +- Provides a detailed explanation of the core scientific concept: the **Flat-Bottom Strategy** and the role of the `Δ` parameter. + +--- + +### **[3. Algorithms and Workflows](./03_algorithms_and_workflows.md)** + +- Provides a step-by-step walkthrough of the main `place::run` workflow, from setup to final results. +- Details the core optimization loop, including clash detection, doublet optimization, and optional simulated annealing. +- Explains the key algorithms that enable performance, such as the incremental energy updates via `EnergyGrid` and the transactional `SystemView` model. + +--- + +### **[4. Performance and Memory](./04_performance_and_memory.md)** + +- Discusses CPU performance optimizations, including the $O(N)$ incremental energy model and parallelism with `rayon`. +- Explains memory efficiency strategies, focusing on the clone-free transactional model (`SystemView`) that avoids costly memory allocations. From 854cd0aeeb75b83db1db8f7e3f289797e73d7667 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Mon, 8 Sep 2025 06:09:06 -0700 Subject: [PATCH 73/83] docs(project): Revise developer documentation for clarity and structure --- docs/dev/core/README.md | 58 ++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 38 deletions(-) diff --git a/docs/dev/core/README.md b/docs/dev/core/README.md index fc3a8d86..ebf0af22 100644 --- a/docs/dev/core/README.md +++ b/docs/dev/core/README.md @@ -1,62 +1,44 @@ # `scream-core` Developer Documentation -Welcome to the developer documentation for `scream-core`, the scientific heart of the SCREAM++ project. 
This library contains all the fundamental data structures, forcefield implementations, and optimization algorithms for protein side-chain placement. +This documentation details the internal architecture, data models, and algorithms of the `scream-core` library. It is intended for developers contributing to or building upon the library's scientific functionalities. For API-level details, refer to the rustdoc documentation. -The purpose of this documentation is to explain the **"why"** behind the code—our architectural decisions, algorithmic strategies, and performance considerations. It is intended to be read alongside the in-code API documentation (`cargo doc`). +## Core Design Principles -## Core Philosophy +The design of `scream-core` adheres to three principles: -The design of `scream-core` is guided by three core principles: - -1. **Layered Architecture**: A strict separation of concerns into three layers (`core`, `engine`, `workflows`) makes the library modular, testable, and extensible. Data representation is cleanly separated from algorithmic logic and high-level procedures. - -2. **Data-Centric Design**: Complex algorithms are built around high-performance data structures. The `MolecularSystem` provides a robust and safe representation of molecular data, while the `EnergyGrid` enables efficient, incremental energy calculations, which is the key to the engine's speed. - -3. **Performance by Design**: This is achieved through aggressive parallelism of independent tasks with `rayon`, efficient memory management via Rust's ownership model, and transactional, clone-free system modifications. - -## Recommended Reading Path - -For developers new to `scream-core`, we recommend reading the detailed documentation in the following order to build a comprehensive understanding of the library: - -1. 
**[Architecture and Data Models](./01_architecture_and_data_models.md)**: Start here to understand the foundational structure of the library and how molecular systems are represented in memory. - -2. **[Forcefield and Energy Calculation](./02_energy_calculation.md)**: Once you understand the data structures, this document explains the scientific and mathematical basis for how we calculate energies, including the core "Flat-Bottom Strategy." - -3. **[Algorithms and Workflows](./03_algorithms_and_workflows.md)**: This document connects the static data models and energy functions into a dynamic process. It details the step-by-step logic of the main side-chain placement algorithm. - -4. **[Performance and Memory](./04_performance_and_memory.md)**: Finally, read this to understand the key optimizations, parallelism strategies, and memory management techniques that make `scream-core` fast and efficient. +1. **Separation of Concerns**: A strict three-layer architecture isolates data representation (`core`), stateful logic (`engine`), and high-level procedures (`workflows`). +2. **Performance by Design**: The architecture employs an incremental energy update model, parallelism, and a transactional memory system to ensure CPU and memory efficiency. +3. **Safety and Correctness**: Rust's ownership model is leveraged to eliminate memory-related bugs common in scientific computing. ## Documentation Index -Here is a quick reference to the detailed documentation for each major component of the library. +The following documents provide a comprehensive overview of the library's internals. --- -### **[1. Architecture and Data Models](./01_architecture_and_data_models.md)** +### **1. [Architecture and Data Models](./01_architecture_and_data_models.md)** -- Explains the three-layer (`core`, `engine`, `workflows`) design philosophy. -- Details the structure of `MolecularSystem` and the rationale behind using `slotmap` for stable identifiers. 
-- Describes how structural knowledge is encoded in `TopologyRegistry` and `RotamerLibrary`. +- The **Three-Layer Architecture** (Core, Engine, Workflows). +- Core data structures, including `MolecularSystem` and the rationale for `slotmap`-based identifiers. +- The role of `TopologyRegistry` and `RotamerLibrary`. --- -### **[2. Forcefield and Energy Calculation](./02_energy_calculation.md)** +### **2. [Forcefield and Energy Calculation](./02_energy_calculation.md)** -- Starts from the fundamental physics of pairwise non-bonded energies. -- Breaks down the total energy into algorithmically efficient components: `E_fixed`, `E_interaction`, and `E_EL`. -- Provides a detailed explanation of the core scientific concept: the **Flat-Bottom Strategy** and the role of the `Δ` parameter. +- The decomposition of the physical pairwise energy model into algorithmically efficient components: `Fixed Energy`, `Interaction Energy`, and `Empty Lattice (EL) Energy`. +- The theoretical basis and implementation of the **Flat-Bottom Strategy**. --- -### **[3. Algorithms and Workflows](./03_algorithms_and_workflows.md)** +### **3. [Algorithms and Workflows](./03_algorithms_and_workflows.md)** -- Provides a step-by-step walkthrough of the main `place::run` workflow, from setup to final results. -- Details the core optimization loop, including clash detection, doublet optimization, and optional simulated annealing. -- Explains the key algorithms that enable performance, such as the incremental energy updates via `EnergyGrid` and the transactional `SystemView` model. +- Step-by-step analysis of the main `place::run` workflow. +- Detailed breakdown of core algorithms: **Doublet Optimization**, **Simulated Annealing**, and the **Incremental Energy Update Model** (`EnergyGrid`). --- -### **[4. Performance and Memory](./04_performance_and_memory.md)** +### **4. 
[Performance and Memory](./04_performance_and_memory.md)** -- Discusses CPU performance optimizations, including the $O(N)$ incremental energy model and parallelism with `rayon`. -- Explains memory efficiency strategies, focusing on the clone-free transactional model (`SystemView`) that avoids costly memory allocations. +- CPU optimization strategies, including **parallelism with Rayon** and heuristic pruning. +- Memory efficiency through the **transactional model (`SystemView`)**, which avoids expensive cloning of the `MolecularSystem`. From d80a73e7c594dd08bf12398ef6389c95c12e5a99 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Mon, 8 Sep 2025 06:30:21 -0700 Subject: [PATCH 74/83] docs(cli): Add comprehensive developer documentation for scream-cli crate --- docs/dev/cli/README.md | 203 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 docs/dev/cli/README.md diff --git a/docs/dev/cli/README.md b/docs/dev/cli/README.md new file mode 100644 index 00000000..4ba1c3fa --- /dev/null +++ b/docs/dev/cli/README.md @@ -0,0 +1,203 @@ +# `scream-cli` Developer Documentation + +This document provides a comprehensive technical breakdown of the `scream-cli` crate. It details the flow of execution, configuration handling, data management, and the user interface system. It is intended for developers contributing to or extending the CLI's functionality. + +For user-facing instructions on how to _use_ the CLI, see the [User Manual (`/docs/cli/USAGE.md`)](/docs/cli/USAGE.md). + +**Table of Contents** + +- [`scream-cli` Developer Documentation](#scream-cli-developer-documentation) + - [1. Component Overview](#1-component-overview) + - [2. Execution Flow of a `scream place` Command](#2-execution-flow-of-a-scream-place-command) + - [3. The Layered Configuration System](#3-the-layered-configuration-system) + - [3.1. The Three Layers of Configuration](#31-the-three-layers-of-configuration) + - [3.2. 
The Build Process in `config::builder`](#32-the-build-process-in-configbuilder) + - [4. Data Management with `DataManager`](#4-data-management-with-datamanager) + - [4.1. Data Path Resolution](#41-data-path-resolution) + - [4.2. Logical Name Resolution](#42-logical-name-resolution) + - [4.3. Data Downloading](#43-data-downloading) + - [5. User Interface (UI) and Logging](#5-user-interface-ui-and-logging) + - [5.1. Decoupled Architecture with `tokio::mpsc`](#51-decoupled-architecture-with-tokiompsc) + - [5.2. Progress Reporting](#52-progress-reporting) + - [5.3. Logging Integration](#53-logging-integration) + +--- + +## 1. Component Overview + +The `scream-cli` crate is structured into several modules, each with a distinct responsibility, to create a maintainable and extensible command-line application. + +- **`main.rs`**: The application entry point. It is responsible for setting up the `tokio` asynchronous runtime, initializing global error handling (`color-eyre`), parsing command-line arguments, and spawning the main UI manager task. + +- **`cli.rs`**: Defines the entire command-line interface structure using `clap`'s derive macros. This file is the single source of truth for all commands (`place`, `data`), subcommands, arguments, and help messages. + +- **`commands/`**: Contains the primary business logic for each subcommand. + + - `place.rs`: Orchestrates the entire side-chain placement process, from configuration building to calling `scream-core` and processing the results. + - `data.rs`: Implements the logic for the `scream data` subcommands (`download`, `path`, etc.). + +- **`config/`**: The configuration engine. + + - `file.rs`: Defines the Rust structs that map directly to the `config.toml` file format using `serde`. + - `defaults.rs`: Provides hardcoded, sensible fallback values for all configurable parameters. 
+ - `builder.rs`: Implements the core logic for merging the three configuration layers (defaults, file, and CLI arguments) into a final, validated `PlacementConfig` struct for `scream-core`. + +- **`data.rs`**: Implements the `DataManager`, a crucial abstraction that handles the physical location, resolution of logical names, and downloading of external data files (forcefields, rotamer libraries). + +- **`ui.rs`**: Manages all terminal output. It uses the `indicatif` crate for progress bars and runs in a separate `tokio` task. It receives events to display from both the core library and the logging system via a shared channel. + +- **`logging.rs`**: Configures the `tracing` subscriber framework. Its key feature is a custom `tracing::Layer` (`ChannelLayer`) that intercepts log messages and forwards them to the `UiManager` for display, ensuring that logging does not interfere with progress bar rendering. + +- **`error.rs`**: Defines the CLI-specific error enum (`CliError`), which centralizes error handling for the application. + +## 2. Execution Flow of a `scream place` Command + +A typical `scream place` command follows a well-defined sequence of operations, translating user input into a scientific result. 
+**Figure 1: `scream place` Execution Flow** + +```mermaid +sequenceDiagram + participant User + participant CLI as CLI (main.rs) + participant Config Builder as Config Builder (config::builder) + participant DataManager as DataManager (data.rs) + participant Core Library as Core Library (scream-core) + participant UI Manager as UI Manager (ui.rs) + + User->>+CLI: executes `scream place ...` + CLI->>CLI: `clap::parse()` command-line args + CLI->>+UI Manager: Spawn UiManager task + CLI->>+Config Builder: `build_config(args)` + Config Builder->>+DataManager: Resolve logical names (e.g., 'charmm@rmsd-1.0') + DataManager-->>-Config Builder: Return concrete file paths + Config Builder-->>-CLI: Return final `PlacementConfig` + CLI->>+Core Library: `workflows::place::run(system, config, reporter)` + Note right of Core Library: Emits `Progress` events to<br/>
UI Manager via reporter callback + Core Library-->>-CLI: Return `PlacementResult` + CLI->>CLI: Process results & write output file(s) + CLI->>-UI Manager: Send shutdown signal +``` + +1. **Parsing**: `main.rs` uses `clap` to parse all command-line arguments into the `Cli` struct. +2. **UI Initialization**: The `UiManager` is spawned as a separate asynchronous `tokio` task to handle all terminal rendering independently of the main computation. +3. **Configuration Building**: `commands::place::run` calls `config::builder::build_config`, passing the parsed arguments. +4. **Path Resolution**: The `Config Builder` uses the `DataManager` to resolve any "logical names" for data files (like `'charmm@rmsd-1.0'`) into absolute file paths. +5. **Core Invocation**: A final, validated `PlacementConfig` is constructed and passed to the `scream_core::workflows::place::run` function. The computationally intensive work happens here, executed within a `tokio::task::spawn_blocking` call to avoid blocking the async runtime. +6. **Progress Reporting**: During execution, the core library sends `Progress` events back to the `UiManager` via a callback, which updates the progress bars in the terminal. +7. **Result Handling**: Once `place::run` returns a `PlacementResult`, the `commands::place` module formats the summary, prints it to the console, and writes the resulting molecular structure(s) to the output file(s) specified by the user. + +## 3. The Layered Configuration System + +The CLI's configuration system is designed to be flexible and predictable, merging settings from three distinct sources with a clear order of precedence. + +### 3.1. The Three Layers of Configuration + +Settings are determined by the first layer in which they are found, following this hierarchy: + +**Figure 2: Configuration Priority** + +```mermaid +graph TD + subgraph Priority + direction LR + A["Layer 1:
<br/>CLI Arguments<br/>(Highest Priority)"] --> B["Layer 2:<br/>`config.toml` File"]; + B --> C["Layer 3:<br/>Built-in Defaults<br/>
(Lowest Priority)"]; + end +``` + +1. **Layer 1: Command-Line Arguments**: Any argument provided directly on the command line (e.g., `--s-factor 1.2`, `-n 5`) always overrides settings from other layers. This is ideal for quick experiments and scripting. +2. **Layer 2: TOML Configuration File**: The `config.toml` file is the primary method for specifying complex or persistent settings, such as the `[residues-to-optimize]` table. +3. **Layer 3: Built-in Defaults (`config::defaults`)**: These are hardcoded fallback values that ensure the program can run with minimal user input. + +### 3.2. The Build Process in `config::builder` + +The `config::builder::build_config` function is the orchestrator of this merging logic. Its process is as follows: + +1. It starts with the `DefaultsConfig` struct. +2. It loads the `FileConfig` struct by parsing the TOML file specified via `--config`. If no file is given, an empty `FileConfig` is used. +3. It applies any `--set KEY=VALUE` arguments, directly modifying the in-memory `FileConfig` struct. +4. It iterates through every parameter required by `scream-core`'s `PlacementConfig`. For each parameter, it checks for a value in this order: command-line argument, `FileConfig` struct, and finally the `DefaultsConfig`. +5. During this process, it uses the `DataManager` to translate any string-based "logical names" into verified `PathBuf`s. +6. Finally, it constructs and returns the fully validated `PlacementConfig` object. + +## 4. Data Management with `DataManager` + +The `DataManager` (`data.rs`) is a critical abstraction that decouples the application logic from the physical storage of data files like forcefields and rotamer libraries. + +### 4.1. Data Path Resolution + +The `DataManager` determines the root directory for all data files using a two-step process: + +1. It first looks for a configuration file in an OS-specific config location (e.g., `~/.config/screampp/path.conf` on Linux). 
This file can be created and modified by the user via `scream data set-path` and `reset-path`. +2. If this file does not exist, it falls back to an OS-specific default data directory (e.g., `~/.local/share/screampp` on Linux). This logic is handled by the `directories-rs` crate. + +This ensures a predictable and platform-idiomatic location for data while still allowing user customization. + +### 4.2. Logical Name Resolution + +To provide a user-friendly experience, the CLI uses "logical names" instead of requiring full file paths for common data files. + +- **The Problem**: A user should not have to type `--rotamer-library /path/to/data/rotamers/charmm/rmsd-1.0.toml`. +- **The Solution**: + 1. The `utils::parser` module contains functions that parse user-friendly strings like `'charmm@rmsd-1.0'` into a structured representation (e.g., `RotamerLibraryName { scheme: "charmm", diversity: "rmsd-1.0" }`). + 2. The `DataManager::resolve_logical_name` method takes this structured data and constructs the full, platform-correct file path within the resolved data directory. It also performs an existence check on the final path. + +### 4.3. Data Downloading + +The `scream data download` command automates the acquisition of required data. + +- **Process**: It uses the `reqwest` library to fetch a version-matched `.tar.zst` archive from the official SCREAM++ GitHub Releases page. The downloaded data is streamed into memory, decompressed on-the-fly using the `zstd` crate, and unpacked into the data directory using the `tar` crate. This entire process is visualized with an `indicatif` progress bar. + +## 5. User Interface (UI) and Logging + +The CLI's user interface is designed to be responsive and non-blocking, providing clear feedback without slowing down the core computation. + +### 5.1. Decoupled Architecture with `tokio::mpsc` + +The UI is architected around an asynchronous, event-driven model. 
+**Figure 3: UI and Logging Event Flow** + +```mermaid +graph TD + subgraph "Core Computation (Blocking Thread)" + A["scream-core Reporter"]; + end + + subgraph "CLI Main Thread" + B["tracing::Layer"]; + end + + subgraph "UI Task (Async)" + E["UiManager"]; + F["indicatif MultiProgress"]; + E --> F; + end + + C["tokio::sync::mpsc::channel"]; + + A -- "Progress Event" --> C; + B -- "Log Event" --> C; + C -- "UiEvent" --> E; + F --> G[Terminal Output]; +``` + +- **Decoupling**: The main computational logic (in `scream-core`) runs in a blocking thread pool managed by `tokio::task::spawn_blocking`. The `UiManager` runs in a separate, non-blocking asynchronous task. +- **Communication**: They communicate via a `tokio::sync::mpsc` (multi-producer, single-consumer) channel. This allows the computationally-heavy core library and the logging system to send `UiEvent` messages to the `UiManager` without waiting for the terminal to render. + +### 5.2. Progress Reporting + +- A `CliProgressHandler` is created in the CLI. It holds a sender handle to the mpsc channel. +- It generates a `ProgressCallback` (a boxed closure trait object), which captures the channel sender. +- This callback is passed down into `scream-core` as part of the `ProgressReporter`. +- When the core library needs to report progress (e.g., `Progress::TaskIncrement`), it calls the callback. The callback's only job is to send the `Progress` event into the channel asynchronously. +- The `UiManager`'s main loop receives the event and updates the `indicatif` progress bars accordingly. + +### 5.3. Logging Integration + +To prevent log messages from interfering with the dynamically rendered progress bars, the logging system is integrated into the same UI event loop. + +- **`logging::ChannelLayer`**: A custom `tracing::Layer` is implemented. +- **Mechanism**: Instead of writing formatted log messages directly to `stdout` or `stderr`, this layer intercepts each log event from the `tracing` framework.
It formats the message and sends it as a `UiEvent::Log` over the _same_ mpsc channel used for progress events. +- **Benefit**: The `UiManager` receives both log messages and progress updates through a single stream. It can then use `indicatif`'s `MultiProgress::println` method to print the log message _above_ the active progress bars, ensuring a clean, non-flickering, and readable terminal output. From 34a5c88b4df2305b10b2781b7155eb105631e41a Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Mon, 8 Sep 2025 06:59:56 -0700 Subject: [PATCH 75/83] docs(project): Enhance README with detailed usage instructions and future interface plans --- README.md | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 110 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d436823e..9500faea 100644 --- a/README.md +++ b/README.md @@ -12,16 +12,120 @@ The core mission of SCREAM++ is to provide a robust, reliable, and easy-to-use t - **Flexible Optimization Modes**: Supports both global side-chain optimization and focused refinement within a defined **binding site** or region of interest. - **Modern Tooling**: Managed by Cargo for simple, reproducible builds and dependency management. - **Multiple Interfaces**: - - **Standalone CLI**: An easy-to-use command-line interface for standard prediction tasks. - - **Python Package**: A user-friendly Python API (via PyO3) for scripting, integration, and advanced workflows. - - **C-Compatible Library**: A C FFI layer for integration with C, C++, and other programming languages. - - **Native Rust Crate**: A core Rust library (`screampp`) available on crates.io for direct use in other Rust-based scientific computing projects. + - **Standalone CLI**: A powerful and easy-to-use command-line interface for all prediction tasks. + - **Native Rust Crate**: The core library (`screampp`) is available on [crates.io](https://crates.io/crates/screampp) for direct use in other Rust-based scientific computing projects. 
+ - **Python Package (Future)**: A user-friendly Python API (via PyO3) for scripting and integration is planned for future releases. + - **C-Compatible Library (Future)**: A C FFI layer for integration with C, C++, and other languages is planned for future releases. + +## Getting Started + +There are two main ways to use SCREAM++: as a standalone command-line tool or as a library in your own Rust project. + +### 1. Using the Command-Line Interface (CLI) + +This is the recommended method for most users. + +#### Step 1: Download the executable + +Go to the [**GitHub Releases page**](https://github.com/caltechmsc/screampp/releases) and download the pre-compiled binary for your operating system (Linux, macOS, Windows). Unzip the archive. + +#### Step 2: Download the required data files + +The first time you run the CLI, you must download the necessary forcefield and rotamer library files. This is a one-time setup. + +```bash +# On Linux/macOS +./scream data download + +# On Windows (Command Prompt) +scream.exe data download +``` + +This command will fetch and unpack the data to a default location on your system. + +#### Step 3: Run a prediction + +You are now ready to run a side-chain placement job. + +```bash +# Optimize all side-chains in input.bgf and save the result +./scream place -i path/to/input.bgf -o path/to/output.bgf +``` + +For detailed instructions on all commands, options, and advanced configuration, please refer to the [**CLI User Manual**](docs/cli/USAGE.md). + +### 2. Using as a Rust Library + +If you are a Rust developer, you can add `screampp` as a dependency to your project. + +#### Step 1: Add to `Cargo.toml` + +```toml +[dependencies] +screampp = "0.5.0" +``` + +#### Step 2: Use in your code + +You can now use the high-level `workflows` API to perform side-chain placement programmatically. 
+ +```rust +use screampp::core::io::bgf::BgfFile; +use screampp::engine::config::{PlacementConfigBuilder, ResidueSelection, ConvergenceConfig}; +use screampp::engine::progress::ProgressReporter; +use screampp::workflows::place; + +fn run_placement() -> Result<(), Box<dyn std::error::Error>> { + let (system, metadata) = BgfFile::read_from_path("path/to/input.bgf")?; + + let config = PlacementConfigBuilder::new() + .forcefield_path("path/to/data/forcefield/dreiding-lj-12-6-0.4.toml") + .delta_params_path("path/to/data/delta/delta-rmsd-1.0.csv") + .s_factor(1.1) + .rotamer_library_path("path/to/data/rotamers/charmm@rmsd-1.0.toml") + .topology_registry_path("path/to/data/topology/registry.toml") + .residues_to_optimize(ResidueSelection::All) + .max_iterations(100) + .num_solutions(1) + .include_input_conformation(true) + .final_refinement_iterations(2) + .convergence_config(ConvergenceConfig { + energy_threshold: 0.01, + patience_iterations: 5, + }) + .build()?; + + let reporter = ProgressReporter::new(); // Or provide a callback for progress updates + let result = place::run(&system, &config, &reporter)?; + + if let Some(best_solution) = result.solutions.first() { + println!("Best energy: {:.2} kcal/mol", best_solution.total_energy); + } + + Ok(()) +} +``` + +## Documentation + +Comprehensive documentation is available for both users and developers. + +- **For Users**: + + - [**CLI User Manual**](docs/cli/USAGE.md): A complete guide to installing and using the `scream` command-line tool, including detailed explanations of all commands, configuration options, and practical examples. + +- **For Developers**: + + - [**Rust Library API Docs (docs.rs)**](https://docs.rs/screampp): The official, versioned API documentation for the `scream-core` (`screampp`) crate, generated by `rustdoc`. This is the best resource for understanding the public API of the library.
+ - **Developer Documentation**: In-depth documentation covering the architecture, data models, algorithms, and design philosophy of the entire project. This is essential reading for anyone looking to contribute to or deeply understand the internals of SCREAM++. + - [**`scream-core` Developer Docs**](docs/dev/core/README.md): Details the internal architecture, data models, and algorithms of the `scream-core` library. + - [**`scream-cli` Developer Docs**](docs/dev/cli/README.md): Provides a comprehensive technical breakdown of the `scream-cli` crate, including execution flow, configuration handling, and data management. ## Tech Stack -- **Language**: Rust -- **Supported Languages**: Rust, Python (via PyO3), C/C++ (via FFI) +- **Core Language**: Rust - **Build System**: Cargo +- **Planned Interfaces**: Python (via PyO3), C/C++ (via FFI) ## License From 27488abcafc63ce06a99619bc03fa03dfaec71ed Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Mon, 8 Sep 2025 15:45:10 -0700 Subject: [PATCH 76/83] docs(project): Revise library documentation for clarity and architectural details --- crates/scream-core/src/lib.rs | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/crates/scream-core/src/lib.rs b/crates/scream-core/src/lib.rs index 83e221ec..f447502a 100644 --- a/crates/scream-core/src/lib.rs +++ b/crates/scream-core/src/lib.rs @@ -1,17 +1,24 @@ //! # SCREAM++ Core Library //! -//! A modernized, high-performance library for protein side-chain placement and structure redesign. +//! A modernized, high-performance library for protein side-chain placement and structure redesign, +//! based on the scientific principles of the SCREAM method. //! -//! ## Architecture +//! ## Architectural Philosophy //! -//! The library is structured into three primary public modules, designed to be used at different -//! levels of abstraction: +//! The library is designed with a strict three-layer architecture to ensure a clear separation of concerns, +//! 
making it modular, testable, and extensible. //! -//! - [`workflows`]: The highest-level API. +//! - **[`core`]: The Foundation.** Contains stateless data models (`MolecularSystem`), +//! pure mathematical representations of the forcefield (`potentials`, `scoring`), and I/O utilities. //! -//! - [`engine`]: The configuration and state management layer. +//! - **[`engine`]: The Logic Core.** This stateful layer orchestrates the optimization process. +//! It includes high-performance data structures like `EnergyGrid` for incremental updates, +//! `SystemView` for transactional modifications, and the implementation of optimization +//! algorithms (e.g., `doublet_optimization`). //! -//! - [`core`]: The foundational layer. It contains the fundamental data models, file I/O traits, and lower-level computational tools. +//! - **[`workflows`]: The Public API.** This is the highest-level, user-facing layer. It ties the +//! `engine` and `core` together to execute complete scientific procedures, such as side-chain +//! placement. It provides a simple and powerful entry point for end-users of the library. pub mod core; pub mod engine; From 15e36d9759f9f14579f1c51aabcd639b43ef47f1 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Mon, 8 Sep 2025 15:54:06 -0700 Subject: [PATCH 77/83] docs(core): Enhance flat-bottom potential function documentation for clarity and detail --- .../src/core/forcefield/potentials.rs | 43 ++++++++++++------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/crates/scream-core/src/core/forcefield/potentials.rs b/crates/scream-core/src/core/forcefield/potentials.rs index 5c1d2019..d0edbbcc 100644 --- a/crates/scream-core/src/core/forcefield/potentials.rs +++ b/crates/scream-core/src/core/forcefield/potentials.rs @@ -117,20 +117,26 @@ pub fn dreiding_hbond_12_10(dist_ad: f64, r_hb: f64, d_hb: f64) -> f64 { /// Applies a flat-bottom modification to a van der Waals potential function. 
/// -/// This function modifies the behavior of a potential in the repulsive region by -/// creating a flat energy well around the ideal distance, which can improve -/// numerical stability in molecular dynamics simulations. +/// This function implements the "Flat-Bottom Strategy" central to the SCREAM method. +/// It addresses the issue of discrete rotamer libraries, where the best available rotamer +/// may have minor steric clashes with its environment. A standard potential (like Lennard-Jones) +/// would impose a large, unrealistic energy penalty for such small clashes. +/// +/// This potential creates a "forgiveness" zone of width `delta` around the ideal +/// interaction distance (`ideal_dist`). If the distance falls within `[ideal_dist - delta, ideal_dist]`, +/// the energy is clamped to the potential minimum, applying no penalty. For distances +/// shorter than this, the repulsive wall is effectively shifted by `delta`. /// /// # Arguments /// /// * `dist` - The actual distance between atoms. -/// * `ideal_dist` - The ideal equilibrium distance. -/// * `delta` - The width of the flat-bottom region. -/// * `potential_fn` - The base potential function to modify. +/// * `ideal_dist` - The ideal equilibrium distance (e.g., R_min in Lennard-Jones). +/// * `delta` - The width of the flat-bottom region, calculated as `s * σ`. +/// * `potential_fn` - A closure representing the base potential function (e.g., Lennard-Jones 12-6). /// /// # Return /// -/// Returns the modified potential energy. +/// Returns the modified van der Waals potential energy. #[inline] pub fn apply_flat_bottom_vdw(dist: f64, ideal_dist: f64, delta: f64, potential_fn: F) -> f64 where @@ -159,20 +165,27 @@ where /// Applies a flat-bottom modification to a hydrogen bond potential function. /// -/// This function creates a flat energy region around the ideal hydrogen bond -/// distance to stabilize the interaction while maintaining the correct asymptotic -/// behavior at long and short ranges. 
+/// This function implements the "Flat-Bottom Strategy" for the hydrogen bond term, +/// sharing the same motivation as its van der Waals counterpart: to accommodate small +/// inaccuracies from discrete rotamer libraries. It prevents penalizing near-optimal +/// polar contacts that are slightly too close or too far from their ideal distance. +/// +/// The mechanism is distinct from the VDW modification. Instead of only shifting the +/// repulsive wall, this function widens the entire potential well. Both the inner (repulsive) +/// and outer (attractive) walls of the potential are shifted away from `ideal_dist` by `delta`. This creates a +/// flat energy minimum in the range `[ideal_dist - delta, ideal_dist + delta]`, +/// making the potential more "forgiving" for interactions around the optimal geometry. /// /// # Arguments /// -/// * `dist` - The actual distance between atoms. -/// * `ideal_dist` - The ideal hydrogen bond distance. -/// * `delta` - The width of the flat-bottom region. -/// * `potential_fn` - The base potential function to modify. +/// * `dist` - The actual distance between the donor and acceptor heavy atoms. +/// * `ideal_dist` - The ideal equilibrium distance for the hydrogen bond (e.g., R_hb). +/// * `delta` - The distance to shift both the inner and outer walls, widening the well. +/// * `potential_fn` - A closure representing the base H-bond potential (e.g., Dreiding 12-10). /// /// # Return /// -/// Returns the modified potential energy. +/// Returns the modified hydrogen bond potential energy.
#[inline] pub fn apply_flat_bottom_hbond(dist: f64, ideal_dist: f64, delta: f64, potential_fn: F) -> f64 where From 6ff2eb5deff1debc7539ab78dfcf73b126b55474 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Mon, 8 Sep 2025 16:14:59 -0700 Subject: [PATCH 78/83] docs(workflows): Improve documentation for side-chain placement workflow with detailed algorithm steps and error handling --- crates/scream-core/src/workflows/place.rs | 29 ++++++++++++++++------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/crates/scream-core/src/workflows/place.rs b/crates/scream-core/src/workflows/place.rs index c269f535..1ff2e745 100644 --- a/crates/scream-core/src/workflows/place.rs +++ b/crates/scream-core/src/workflows/place.rs @@ -37,24 +37,35 @@ pub struct PlacementResult { /// Executes the complete side-chain placement workflow. /// -/// This function orchestrates the entire optimization process for protein side-chain -/// conformation prediction, including preparation, energy calculations, clash resolution, -/// optional simulated annealing, and final refinement. +/// This is the primary high-level entry point for running the SCREAM++ algorithm. It orchestrates +/// the entire optimization process, translating a static molecular system and a configuration +/// into a set of optimized, low-energy solutions. +/// +/// The workflow follows the multi-stage algorithm described in the original SCREAM paper: +/// 1. **Preparation**: Loads all necessary resources (forcefield, rotamers) and identifies active residues. +/// 2. **Pre-computation**: Calculates the Empty Lattice (EL) energy for every possible rotamer of every active residue. +/// 3. **Initialization**: Creates an initial "ground state" conformation by selecting the lowest EL-energy rotamer for each residue. +/// 4. **Clash Resolution**: Iteratively identifies and resolves the worst energetic clashes using doublet optimization. +/// 5. 
**Simulated Annealing (Optional)**: Explores the conformational space to escape local energy minima. +/// 6. **Final Refinement**: Performs final single-residue optimizations to polish the best solutions. /// /// # Arguments /// -/// * `initial_system` - The input molecular system to optimize. -/// * `config` - Configuration parameters for the placement workflow. -/// * `reporter` - Progress reporter for tracking optimization progress. +/// * `initial_system` - A reference to the input `MolecularSystem` with the protein backbone and initial side-chain positions. +/// * `config` - A `PlacementConfig` struct containing all parameters for the run, including forcefield paths, residue selections, and algorithm settings. +/// * `reporter` - A `ProgressReporter` that can be used to receive progress updates for long-running calculations, useful for GUIs or CLIs. /// /// # Return /// -/// Returns a `PlacementResult` containing the initial state and optimized solutions. +/// Returns a `Result` containing a `PlacementResult` on success. The `PlacementResult` holds the +/// initial state of the system and a vector of the best `Solution`s found, sorted by energy. /// /// # Errors /// -/// Returns `EngineError` if any step in the workflow fails due to invalid input, -/// configuration issues, or computational errors. +/// Returns an `EngineError` if any stage of the workflow fails, such as: +/// - `EngineError::LibraryLoad`: If a required rotamer library or parameter file cannot be found or parsed. +/// - `EngineError::ResidueNotFound`: If a residue specified in the config does not exist in the system. +/// - `EngineError::Placement`: If a geometric error occurs during rotamer placement. 
#[instrument(skip_all, name = "placement_workflow")] pub fn run( initial_system: &MolecularSystem, From a998be96744be760b4e6e73f3a7db82096a73e2f Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Mon, 8 Sep 2025 16:17:39 -0700 Subject: [PATCH 79/83] docs(engine): Enhance documentation for EnergyGrid struct to clarify incremental update model and energy tracking --- crates/scream-core/src/engine/energy_grid.rs | 21 ++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/crates/scream-core/src/engine/energy_grid.rs b/crates/scream-core/src/engine/energy_grid.rs index f5c84772..aa47f82d 100644 --- a/crates/scream-core/src/engine/energy_grid.rs +++ b/crates/scream-core/src/engine/energy_grid.rs @@ -15,13 +15,22 @@ use tracing::{info, trace}; #[cfg(feature = "parallel")] use rayon::prelude::*; -/// Manages the energy landscape for molecular optimization in SCREAM++. +/// Manages the energy landscape for molecular optimization, enabling high-performance incremental updates. /// -/// This struct provides a comprehensive energy calculation framework for protein -/// side-chain placement optimization. It maintains pairwise interaction energies -/// between residues, total interaction energies, and individual residue energies -/// to enable efficient delta energy calculations during optimization moves. -/// The energy grid supports both serial and parallel computation modes. +/// The `EnergyGrid` is a cornerstone of the SCREAM++ engine's performance. A naive optimization +/// approach would recalculate the entire system's energy (an O(N^2) operation for N active residues) +/// after every single rotamer change. This becomes prohibitively expensive. +/// +/// This structure solves that problem by implementing an **incremental update model**. It pre-calculates +/// and stores all pairwise interaction energies. When a single residue's conformation changes, +/// only the O(N) interactions involving that residue need to be recomputed. 
The `EnergyGrid` +/// can then update the total energy in O(N) time by applying a `MoveDelta`. +/// +/// It tracks: +/// - Pairwise interaction energies between all active residue pairs. +/// - The sum of interaction energies for each individual residue. +/// - The pre-computed Empty Lattice (EL) energy for each residue's current rotamer. +/// - The total optimization score, which is the sum of all interaction and EL energies. #[derive(Debug, Clone)] pub struct EnergyGrid { /// Pairwise interaction energies between residue pairs. From 22b1ccfba20e82b407298654729c16644c63c567 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Mon, 8 Sep 2025 16:19:11 -0700 Subject: [PATCH 80/83] docs(engine): Improve documentation for SystemView struct to clarify temporary modifications and energy evaluation process --- crates/scream-core/src/engine/transaction.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/crates/scream-core/src/engine/transaction.rs b/crates/scream-core/src/engine/transaction.rs index fbb254b6..e306d2f5 100644 --- a/crates/scream-core/src/engine/transaction.rs +++ b/crates/scream-core/src/engine/transaction.rs @@ -5,12 +5,21 @@ use crate::core::models::ids::ResidueId; use crate::core::models::system::MolecularSystem; use std::collections::HashMap; -/// Provides a transactional view of the molecular system for optimization operations. +/// Provides a transactional, temporary view into a `MolecularSystem` for "what-if" calculations. /// -/// This struct enables safe, temporary modifications to the molecular system during -/// optimization moves. It tracks rotamer assignments and provides transaction methods -/// that automatically revert changes after evaluation, ensuring the system remains -/// in a consistent state for energy calculations. +/// A critical challenge in optimization algorithms is evaluating the energy of a proposed +/// move without permanently altering the system state. 
The naive approach is to `clone()` the +/// entire `MolecularSystem` for each evaluation, which is extremely costly in terms of both +/// memory allocation and CPU time, especially for large proteins. +/// +/// `SystemView` solves this problem by providing a safe, temporary mutable view. The `transaction` +/// and `transaction_doublet` methods work by: +/// 1. Saving the original state (rotamer indices) of the target residue(s). +/// 2. Executing a user-provided closure that is allowed to make temporary modifications. +/// 3. **Guaranteeing** that the original state is restored after the closure completes. +/// +/// This provides the illusion of a temporary copy with virtually zero overhead, dramatically +/// improving performance and reducing memory pressure. pub struct SystemView<'a, 'ctx, C> where C: super::context::ProvidesResidueSelections + Sync, From fd8deb991ac8d881d24a4a8c9b5f2d658c7e72cb Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Mon, 8 Sep 2025 16:19:27 -0700 Subject: [PATCH 81/83] docs(cli): Clarify the importance of downloading data files as a mandatory first step --- docs/cli/USAGE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/cli/USAGE.md b/docs/cli/USAGE.md index 0b1c839c..b1ff4198 100644 --- a/docs/cli/USAGE.md +++ b/docs/cli/USAGE.md @@ -39,6 +39,8 @@ SCREAM++ relies on a set of data files, including forcefield parameters, rotamer ### Downloading Data Files +> **Important:** This is a **mandatory first step**. SCREAM++ cannot run without these data files. (Unless you plan to provide all necessary files manually, which is not recommended for typical users.) + This is the **mandatory first step** for using SCREAM++. 
Execute the following command to download and automatically unpack all required data files to their default location: ```sh From 842628a6c989d81dbe5abb9e57dfc479481aa8f3 Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Mon, 8 Sep 2025 16:20:11 -0700 Subject: [PATCH 82/83] docs(cli): Clarify note on s-factor optimization and rotamer library diversity requirements --- docs/cli/USAGE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cli/USAGE.md b/docs/cli/USAGE.md index b1ff4198..fcf7ce5e 100644 --- a/docs/cli/USAGE.md +++ b/docs/cli/USAGE.md @@ -376,7 +376,7 @@ scream place \ -s 1.2 ``` -> **Note**: The `diversity` (`rmsd-0.6`) of the `rotamer-library` and `delta-params-path` must match. The `s-factor` (`1.2`) is a recommended value optimized for this diversity. +> **Note**: The `s-factor` is empirically optimized for a given rotamer library `diversity` to achieve the best accuracy. It is crucial that the `diversity` of the `--rotamer-library` (e.g., `rmsd-0.6`) matches the `--delta-params-path`. Using mismatched values may lead to suboptimal results. 
### Example 3: Optimizing a Ligand Binding Pocket From b980d9aa0eb020ea682cc81343ffaedac110c30b Mon Sep 17 00:00:00 2001 From: Tony Kan Date: Mon, 8 Sep 2025 16:20:22 -0700 Subject: [PATCH 83/83] docs(cli): Update configuration file template for clarity and completeness --- docs/cli/USAGE.md | 140 +++++++++++++++++++++++----------------------- 1 file changed, 69 insertions(+), 71 deletions(-) diff --git a/docs/cli/USAGE.md b/docs/cli/USAGE.md index fcf7ce5e..dfc38642 100644 --- a/docs/cli/USAGE.md +++ b/docs/cli/USAGE.md @@ -185,136 +185,134 @@ Here is an example configuration file with all common options and detailed comme ```toml # ============================================================================= -# SCREAM++ Example Configuration File +# SCREAM++ Complete Configuration File Template # ============================================================================= +# +# This file contains ALL available settings for a `scream place` run. +# - To use a setting, uncomment it and modify its value. +# - Settings that are commented out will use the application's default value, +# which is shown in the comment for reference. +# - For detailed explanations, please refer to the full user manual. # ----------------------------------------------------------------------------- -# [forcefield] - Energy Function & Parameters +# [forcefield] - Energy function and forcefield parameters # ----------------------------------------------------------------------------- [forcefield] -# -- Core Parameters -- +# s-factor for the flat-bottom potential. This is a critical parameter for accuracy. +# Its optimal value is dependent on the diversity of the chosen rotamer library. +# Default: 1.1 (optimized for rmsd-1.0 libraries) +s-factor = 1.1 # Path or logical name for the main forcefield parameter file. # Logical names: 'exp-6@0.4', 'lj-12-6@0.4', etc. 
# Default: "exp-6@0.4" -forcefield-path = "exp-6@0.4" +# forcefield-path = "exp-6@0.4" -# Path or logical name for the flat-bottom delta parameters. -# The diversity (e.g., "rmsd-1.0") should match your rotamer library. +# Path or logical name for the flat-bottom delta parameter file. +# The diversity (e.g., "rmsd-1.0") should match the rotamer library. # Default: "rmsd-1.0" -delta-params-path = "rmsd-1.0" - -# The 's-factor' for the flat-bottom potential. This is a critical parameter -# that tunes the tolerance for atomic clashes. -# Default: 1.1 -s-factor = 1.1 +# delta-params-path = "rmsd-1.0" -# -- [Optional] Advanced Energy Weighting -- -# -# This section allows you to scale energy terms for interactions between +# [Optional] Rules for applying custom weights to energy components between # different types of atoms (Atom Roles: Backbone, Sidechain, Ligand, Water, Other). -# By default, all weights are 1.0. -# +# By default, all interactions have a weight of 1.0. # [[forcefield.energy-weights.rules]] # groups = ["Backbone", "Sidechain"] -# weights = { vdw = 0.8, coulomb = 0.8, hbond = 1.0 } -# -# [[forcefield.energy-weights.rules]] -# groups = ["Sidechain", "Ligand"] -# weights = { vdw = 0.5, coulomb = 1.0, hbond = 1.2 } - +# weights = { vdw = 1.0, coulomb = 1.0, hbond = 1.0 } # ----------------------------------------------------------------------------- -# [sampling] - Conformational Sampling +# [sampling] - Side-chain conformation sampling # ----------------------------------------------------------------------------- [sampling] # Path or logical name for the rotamer library. -# The diversity (e.g., "rmsd-1.0") should match your delta-params-path. +# The diversity (e.g., "rmsd-1.0") should match `delta-params-path`. # Logical names: 'charmm@rmsd-1.0', 'amber@rmsd-1.0', etc. 
# Default: "charmm@rmsd-1.0" -rotamer-library = "charmm@rmsd-1.0" - +# rotamer-library = "charmm@rmsd-1.0" # ----------------------------------------------------------------------------- -# [optimization] - Algorithm Control +# [optimization] - Algorithm control # ----------------------------------------------------------------------------- [optimization] -# Number of lowest-energy, unique solutions to generate and save. +# The number of lowest-energy, unique solutions to generate and save. # Default: 1 num-solutions = 1 -# Maximum number of iterations for the primary clash-resolution loop. +# Maximum number of iterations for the main clash resolution algorithm. # Default: 100 -max-iterations = 100 +# max-iterations = 100 -# If true, the original side-chain conformation from the input structure -# will be included as a candidate during the optimization. +# Whether to include the input structure's original side-chain conformation as +# a candidate solution during the optimization. # Default: true -include-input-conformation = true +# include-input-conformation = true -# Number of refinement iterations (singlet optimization) to perform after the -# main clash-resolution loop has converged. Set to 0 to disable. +# Number of refinement iterations (singlet optimization) to run after the +# main loop converges. Set to 0 to disable. # Default: 2 -final-refinement-iterations = 2 +# final-refinement-iterations = 2 -# -- [Optional] Simulated Annealing -- -# -# To enable simulated annealing for better global energy landscape exploration, -# uncomment this entire section. This may improve results but will increase runtime. -# -# [optimization.simulated-annealing] -# initial-temperature = 5.0 # Starting temperature (in energy units). -# final-temperature = 0.1 # Temperature at which to stop the annealing. -# cooling-rate = 0.9 # Multiplicative factor to decrease temperature (e.g., T_new = T_old * 0.9). 
-# steps-per-temperature = 100 # Number of Monte Carlo moves to attempt at each temperature step. +# [optimization.convergence] +# --- Convergence Criteria --- +# The algorithm is considered converged if the best energy improves by less than +# this threshold over a 'patience' number of iterations. +# Default: 0.01 (kcal/mol) +# energy-threshold = 0.01 -# -- Convergence Criteria -- -# -# Defines the conditions for stopping the clash-resolution loop. -# -[optimization.convergence] -# The loop will stop if the best energy found does not improve by at least -# this amount (in kcal/mol) over a 'patience' number of iterations. -# Default: 0.01 -energy-threshold = 0.01 - -# The number of consecutive iterations without sufficient energy improvement -# before the algorithm is considered to have converged. +# The number of consecutive iterations with insufficient energy improvement +# before the optimization loop terminates. # Default: 5 -patience-iterations = 5 +# patience-iterations = 5 +# [optimization.simulated-annealing] +# --- [Optional] Simulated Annealing --- +# To enable, uncomment this entire section. This can help the algorithm escape +# local energy minima but will increase runtime. +# initial-temperature = 5.0 +# final-temperature = 0.1 +# cooling-rate = 0.9 +# steps-per-temperature = 100 # ----------------------------------------------------------------------------- -# [residues-to-optimize] - Defines the Scope of Optimization +# [residues-to-optimize] - Defines which residues to modify # ----------------------------------------------------------------------------- [residues-to-optimize] -# TYPE 1: Optimize all residues in the protein. +# Choose ONE of the following types: "all", "list", or "ligand-binding-site". + +# TYPE 1: Optimize all residues. +# This is the default if the section is omitted. type = "all" # TYPE 2: Optimize a specific list of residues. +# `include` specifies which residues to target. If `include` is empty, it defaults to all residues. 
+# `exclude` specifies which residues to ignore, even if they are in the `include` selection. # type = "list" -# # 'include' defines a whitelist. If 'include' is empty, all residues are selected. # include = [ # { chain-id = 'A', residue-number = 25 }, # { chain-id = 'A', residue-number = 101 }, # ] -# # 'exclude' defines a blacklist that overrides the selection. -# exclude = [ -# { chain-id = 'A', residue-number = 50 }, -# ] +# exclude = [] -# TYPE 3: Optimize residues within a radius of a ligand. +# TYPE 3: Optimize residues within a certain radius of a ligand. +# The radius is measured from any heavy atom of the ligand to any heavy atom of a protein residue. # type = "ligand-binding-site" -# # Specify the ligand's location. +# radius-angstroms = 5.0 # [residues-to-optimize.ligand-residue] # chain-id = 'X' # residue-number = 999 -# # Define the radius in Angstroms from any heavy atom of the ligand. -# radius-angstroms = 5.0 + +# ----------------------------------------------------------------------------- +# Global Settings +# ----------------------------------------------------------------------------- + +# Path or logical name for the residue topology registry. +# In most cases, the default is sufficient. +# Default: "default" +# topology-registry-path = "default" ``` ### Detailed Configuration Options