"""Type definitions for LCMD-DB client."""
from __future__ import annotations
import dataclasses
import sys
from collections.abc import Mapping
from pathlib import Path
from typing import TYPE_CHECKING, Generic, Literal, TypeAlias, overload
if sys.version_info >= (3, 11): # noqa: UP036 — StrEnum needs backport on 3.10
from enum import StrEnum
else:
from enum import Enum as _Enum
class StrEnum(str, _Enum): # noqa: UP042
pass
if sys.version_info >= (3, 13): # noqa: UP036 — TypeVar(default=) needs typing_extensions on <3.13
from typing import TypeVar
else:
from typing_extensions import TypeVar # noqa: UP035
import polars as pl
from pydantic import BaseModel, field_validator
if TYPE_CHECKING:
from .dataset._base import Dataset
from .dataset._fragments import FragmentDataset
from .dataset._molecules import MoleculeDataset
from .dataset._reactions import ReactionDataset
# Scalar types a property value may take when no narrower mapping type is given.
Value: TypeAlias = str | int | float | bool | None # noqa: UP040
# Property-mapping type parameter used by Molecule, Reaction and Fragment.
# Any Mapping[str, object] satisfies the bound; when the parameter is left
# unspecified the TypeVar default, dict[str, Value], applies.
Properties = TypeVar(
"Properties",
bound=Mapping[str, object],
default=dict[str, Value],
)
# The next three parameters have the same bound and default as Properties;
# SubsetData uses them for molecule, reaction and fragment properties
# respectively, so each can be narrowed independently.
MProperties = TypeVar(
"MProperties",
bound=Mapping[str, object],
default=dict[str, Value],
)
RProperties = TypeVar(
"RProperties",
bound=Mapping[str, object],
default=dict[str, Value],
)
FProperties = TypeVar(
"FProperties",
bound=Mapping[str, object],
default=dict[str, Value],
)
# Type of Fragment.fragment_type; unbounded, defaults to str | None.
FType = TypeVar("FType", default=str | None)
# Return type of SubsetData.assembly_templates; unbounded, defaults to dict[str, object].
ATemplates = TypeVar("ATemplates", default=dict[str, object])
[docs]
class EntityType(StrEnum):
MOLECULES = "molecules"
REACTIONS = "reactions"
FRAGMENTS = "fragments"
[docs]
class PropertyDataType(StrEnum):
FLOAT = "float"
INTEGER = "integer"
STRING = "string"
BOOLEAN = "boolean"
# Literal type of four string values: the three entity type names plus "structures".
# NOTE(review): presumably the values a caller may pass to select what to load —
# confirm at the call sites.
Include = Literal["molecules", "reactions", "fragments", "structures"]
# Entity type names in a fixed order; SubsetData.dataframes looks each one up
# as an attribute name on the instance.
ENTITY_TYPES: tuple[str, ...] = ("molecules", "reactions", "fragments")
[docs]
class PropertyInfo(BaseModel, frozen=True):
    """Frozen pydantic model describing a single property.

    Only ``slug`` is mandatory; every other field has a default.
    ``data_type`` is normalised before validation, so a value that is not
    a known ``PropertyDataType`` becomes ``PropertyDataType.STRING`` rather
    than failing validation.
    """

    slug: str
    name: str = ""
    data_type: PropertyDataType = PropertyDataType.STRING
    units: str | None = None
    description: str | None = None
    is_intrinsic: bool = False
    is_required: bool | None = None

    @field_validator("data_type", mode="before")
    @classmethod
    def _coerce_data_type(cls, v: object) -> PropertyDataType:
        """Turn the raw ``data_type`` input into a ``PropertyDataType``.

        Args:
            v: The value supplied for ``data_type`` before pydantic's own
                validation runs.

        Returns:
            ``v`` itself when it is already a ``PropertyDataType``;
            otherwise the member whose value equals ``str(v)``, or
            ``PropertyDataType.STRING`` when no member matches.
        """
        if not isinstance(v, PropertyDataType):
            try:
                v = PropertyDataType(str(v))
            except ValueError:
                # Unknown type names deliberately degrade to STRING.
                v = PropertyDataType.STRING
        return v
[docs]
class SlotInfo(BaseModel, frozen=True):
    """Frozen pydantic model holding the metadata of one assembly template slot.

    ``id`` and ``fragment_type`` must be supplied; ``description``,
    ``required`` and ``default`` fall back to ``""``, ``True`` and ``None``.
    """

    id: str
    fragment_type: str
    description: str = ""
    required: bool = True
    default: str | None = None
[docs]
class AssemblyTemplateInfo(BaseModel, frozen=True):
    """Frozen pydantic model for an assembly template's metadata.

    The metadata is read from metadata.json. Only ``slug`` is mandatory;
    the text fields default to the empty string and ``slots`` to an empty
    list.
    """

    slug: str
    name: str = ""
    description: str = ""
    assembler_name: str = ""
    # A literal list default is acceptable on a pydantic model: pydantic
    # copies field defaults per instance, unlike a plain class attribute.
    slots: list[SlotInfo] = []
[docs]
@dataclasses.dataclass(frozen=True)
class AssemblyResult:
    """Outcome of one assembly inside a batch.

    A single (non-batch) assembly goes through ``template.assemble()``
    instead, which hands back the SMILES string itself and raises when
    the assembly fails.
    """

    smiles: str | None = None
    error: str | None = None

    @property
    def success(self) -> bool:
        """Return ``True`` only when a SMILES is present and no error is recorded."""
        if self.error is not None:
            return False
        return self.smiles is not None
[docs]
@dataclasses.dataclass(frozen=True)
class Molecule(Generic[Properties]):
    """One molecule record: an id, a property mapping and an optional structure file path.

    Properties are read by subscript, e.g. ``mol.properties["energy"]``.
    The property types can be narrowed by per-subset type stubs; see
    ``lcmd-db stubs sync``.
    """

    id: int
    properties: Properties
    structure_path: Path | None = None
[docs]
class ParticipantRole(StrEnum):
"""Role of a molecule participating in a reaction."""
REACTANT = "reactant"
PRODUCT = "product"
CATALYST = "catalyst"
CO_CATALYST = "co_catalyst"
SUBSTRATE = "substrate"
INTERMEDIATE = "intermediate"
TRANSITION_STATE = "transition_state"
SOLVENT = "solvent"
ADDITIVE = "additive"
[docs]
@dataclasses.dataclass(frozen=True)
class Participant:
    """Immutable record of one molecule taking part in a reaction step.

    Attributes:
        molecule (Molecule[dict[str, Value]]): The molecule that participates.
        role (ParticipantRole): Its role in the reaction (reactant, product,
            catalyst, ...).
        step_from (float | None): Starting step index, if applicable.
        step_to (float | None): Ending step index, if applicable.
        label (str): Optional human-readable label; empty by default.
    """

    molecule: Molecule[dict[str, Value]]
    role: ParticipantRole
    step_from: float | None = None
    step_to: float | None = None
    label: str = ""
[docs]
@dataclasses.dataclass(frozen=True)
class Reaction(Generic[Properties]):
    """One reaction record: an id, a property mapping and its participant molecules.

    Properties are read by subscript, e.g. ``rxn.properties["barrier"]``.
    The property types can be narrowed by per-subset type stubs; see
    ``lcmd-db stubs sync``.
    """

    id: int
    properties: Properties
    # default_factory gives every instance its own empty list.
    participants: list[Participant] = dataclasses.field(default_factory=list)
[docs]
@dataclasses.dataclass(frozen=True)
class Fragment(Generic[Properties, FType]):
    """One molecular fragment record: an id, a property mapping and a fragment type.

    Properties are read by subscript, e.g. ``frag.properties["charge"]``.
    The property types can be narrowed by per-subset type stubs; see
    ``lcmd-db stubs sync``.
    """

    id: int
    properties: Properties
    # The runtime default is None whatever FType is bound to, hence the
    # type-checker suppression.
    fragment_type: FType = None  # pyright: ignore[reportAssignmentType]
[docs]
@dataclasses.dataclass
class SubsetData(Generic[MProperties, RProperties, FProperties, FType, ATemplates]):
    """Holds the data returned by a subset download.

    The class is generic over the molecule, reaction and fragment property
    types. A generated ``load_dataset`` overload binds those parameters to
    per-subset ``TypedDict`` classes, which makes
    ``data.as_dataset("molecules")[0].properties["energy"]`` fully typed.
    When no type parameters are given, all three property types default to
    ``dict[str, Value]`` (backward-compatible).
    """

    molecules: pl.DataFrame | None = None
    reactions: pl.DataFrame | None = None
    fragments: pl.DataFrame | None = None
    metadata: SubsetMetadata | None = None
    # Download context: kept out of __init__ and repr, filled in by
    # _from_download().
    _data_dir: Path | None = dataclasses.field(init=False, repr=False, default=None)
    _data_format: DataFormat = dataclasses.field(
        init=False, repr=False, default=DataFormat.PARQUET
    )
    # Memo for the assembly_templates property; stays None until the
    # templates have been built once.
    _assembly_cache: dict[str, object] | None = dataclasses.field(
        init=False, repr=False, default=None
    )

    @classmethod
    def _from_download(
        cls,
        *,
        data_dir: Path,
        data_format: DataFormat,
        molecules: pl.DataFrame | None = None,
        reactions: pl.DataFrame | None = None,
        fragments: pl.DataFrame | None = None,
        metadata: SubsetMetadata | None = None,
    ) -> SubsetData:
        """Internal factory that also records the private download context.

        The public fields go through the normal constructor; ``data_dir``
        and ``data_format`` are assigned afterwards because the
        corresponding fields are declared with ``init=False``.
        """
        subset = cls(
            molecules=molecules,
            reactions=reactions,
            fragments=fragments,
            metadata=metadata,
        )
        subset._data_dir = data_dir
        subset._data_format = data_format
        return subset

    @property
    def dataframes(self) -> dict[str, pl.DataFrame]:
        """Map each entity type name to its DataFrame, omitting those that are ``None``."""
        frames: dict[str, pl.DataFrame] = {}
        for name in ENTITY_TYPES:
            frame = getattr(self, name)
            if frame is not None:
                frames[name] = frame
        return frames

    @overload
    def as_dataset(
        self, entity_type: Literal["molecules"]
    ) -> MoleculeDataset[MProperties]: ...

    @overload
    def as_dataset(
        self, entity_type: Literal["reactions"]
    ) -> ReactionDataset[RProperties]: ...

    @overload
    def as_dataset(
        self, entity_type: Literal["fragments"]
    ) -> FragmentDataset[FProperties, FType]: ...

    def as_dataset(
        self, entity_type: str
    ) -> (
        Dataset[Molecule[MProperties]]
        | Dataset[Reaction[RProperties]]
        | Dataset[Fragment[FProperties, FType]]
    ):
        """Create the typed :class:`~lcmd_db.Dataset` for one entity type.

        Args:
            entity_type: One of ``"molecules"``, ``"reactions"``, or ``"fragments"``.

        Returns:
            A :class:`MoleculeDataset`, :class:`ReactionDataset`, or
            :class:`FragmentDataset` depending on the entity type.

        Raises:
            ValueError: If no data directory is available or the entity type is unknown.

        Example::

            molecules = data.as_dataset("molecules")  # MoleculeDataset
            reactions = data.as_dataset("reactions")  # ReactionDataset
        """
        # Imported inside the method rather than at module level.
        from .dataset._factory import build_dataset

        data_dir = self._data_dir
        if data_dir is None:
            raise ValueError(
                "No data directory available — dataset was not loaded from cache"
            )
        return build_dataset(
            entity_type,
            data_dir=data_dir,
            data_format=self._data_format,
            metadata=self.metadata,
        )

    @property
    def assembly_templates(self) -> ATemplates:
        """Assembly templates available for this subset, keyed by slug.

        Each template can assemble fragment SMILES into a molecule
        via ``template.assemble(core="...", sub1="...")``.

        Only templates whose ``<slug>.py`` file exists under the
        ``assembly`` sub-directory of the data directory are included. The
        built mapping is memoised on the instance. When the data directory
        or the metadata is missing, an empty dict is returned and nothing
        is memoised.
        """
        cached = self._assembly_cache
        if cached is not None:
            return cached  # pyright: ignore[reportReturnType]

        from ._assembly import AssemblyTemplate

        if self._data_dir is None or self.metadata is None:
            return {}  # pyright: ignore[reportReturnType]

        assembly_dir = self._data_dir / "assembly"
        templates = {}
        for info in self.metadata.assembly_templates:
            script = assembly_dir / f"{info.slug}.py"
            if script.exists():
                templates[info.slug] = AssemblyTemplate(info, script)
        self._assembly_cache = templates
        return templates  # pyright: ignore[reportReturnType]