Source code for lcmd_db.types

"""Type definitions for LCMD-DB client."""

from __future__ import annotations

import dataclasses
import sys
from collections.abc import Mapping
from pathlib import Path
from typing import TYPE_CHECKING, Generic, Literal, TypeAlias, overload

if sys.version_info >= (3, 11):  # noqa: UP036 — StrEnum needs backport on 3.10
    from enum import StrEnum
else:
    from enum import Enum as _Enum

    class StrEnum(str, _Enum):  # noqa: UP042
        """Minimal stand-in for :class:`enum.StrEnum` on Python 3.10.

        Members are real ``str`` instances, so they compare equal to their
        value and can be passed anywhere a plain string is accepted.
        """

        def __str__(self) -> str:
            # A bare ``(str, Enum)`` mixin stringifies as "ClassName.MEMBER",
            # whereas the stdlib ``StrEnum`` returns the member value.  Mirror
            # the stdlib so ``str(member)`` is identical on every supported
            # interpreter version.
            return str.__str__(self)


if sys.version_info >= (3, 13):  # noqa: UP036 — TypeVar(default=) needs typing_extensions on <3.13
    from typing import TypeVar
else:
    from typing_extensions import TypeVar  # noqa: UP035

import polars as pl
from pydantic import BaseModel, field_validator

if TYPE_CHECKING:
    from .dataset._base import Dataset
    from .dataset._fragments import FragmentDataset
    from .dataset._molecules import MoleculeDataset
    from .dataset._reactions import ReactionDataset

# Scalar value type; it is the value type of the default property mapping
# ``dict[str, Value]`` used by every property TypeVar below.
Value: TypeAlias = str | int | float | bool | None  # noqa: UP040

# Property-mapping type parameter of ``Molecule``, ``Reaction`` and
# ``Fragment``.  Must be a ``Mapping[str, object]``; falls back to
# ``dict[str, Value]`` when the generic is left unparameterised.
Properties = TypeVar(
    "Properties",
    bound=Mapping[str, object],
    default=dict[str, Value],
)

# Molecule property type parameter of ``SubsetData`` (same bound/default).
MProperties = TypeVar(
    "MProperties",
    bound=Mapping[str, object],
    default=dict[str, Value],
)

# Reaction property type parameter of ``SubsetData`` (same bound/default).
RProperties = TypeVar(
    "RProperties",
    bound=Mapping[str, object],
    default=dict[str, Value],
)

# Fragment property type parameter of ``SubsetData`` (same bound/default).
FProperties = TypeVar(
    "FProperties",
    bound=Mapping[str, object],
    default=dict[str, Value],
)

# Type of ``Fragment.fragment_type``; unbounded, defaults to ``str | None``.
FType = TypeVar("FType", default=str | None)

# Declared return type of ``SubsetData.assembly_templates``; unbounded,
# defaults to ``dict[str, object]``.
ATemplates = TypeVar("ATemplates", default=dict[str, object])


[docs] class DataFormat(StrEnum): CSV = "csv" TSV = "tsv" XLSX = "xlsx" PARQUET = "parquet" JSON = "json"
[docs] class EntityType(StrEnum): MOLECULES = "molecules" REACTIONS = "reactions" FRAGMENTS = "fragments"
[docs] class PropertyDataType(StrEnum): FLOAT = "float" INTEGER = "integer" STRING = "string" BOOLEAN = "boolean"
# String literals naming the three entity kinds plus "structures".
Include = Literal["molecules", "reactions", "fragments", "structures"]

# Entity names in a fixed order; ``SubsetData.dataframes`` iterates this tuple
# and looks each name up as an attribute.
ENTITY_TYPES: tuple[str, ...] = ("molecules", "reactions", "fragments")
# Immutable (``frozen=True``) metadata record for a single property.
# Documented with comments rather than a class docstring because pydantic
# surfaces class docstrings in the generated JSON schema.
class PropertyInfo(BaseModel, frozen=True):
    slug: str
    name: str = ""
    data_type: PropertyDataType = PropertyDataType.STRING
    units: str | None = None
    description: str | None = None
    is_intrinsic: bool = False
    is_required: bool | None = None

    @field_validator("data_type", mode="before")
    @classmethod
    def _coerce_data_type(cls, v: object) -> PropertyDataType:
        """Turn the raw ``data_type`` input into a ``PropertyDataType``.

        Runs before pydantic's own validation.  Enum members pass through
        untouched; anything else is stringified and looked up by value.
        Values that match no member fall back to ``PropertyDataType.STRING``
        instead of failing validation.
        """
        if not isinstance(v, PropertyDataType):
            try:
                v = PropertyDataType(str(v))
            except ValueError:
                # Unknown type tag: degrade to STRING rather than reject.
                v = PropertyDataType.STRING
        return v
# Immutable per-entity summary: two integer counters plus the list of
# property descriptors.
class EntityMetadata(BaseModel, frozen=True):
    count: int = 0
    columns: int = 0
    # A literal list default is safe on a pydantic model: field defaults are
    # copied per instance rather than shared.
    properties: list[PropertyInfo] = []
class SlotInfo(BaseModel, frozen=True):
    """Metadata for a single assembly template slot."""

    # Only ``id`` and ``fragment_type`` are mandatory; a slot is required
    # and has no default value unless the metadata says otherwise.
    id: str
    fragment_type: str
    description: str = ""
    required: bool = True
    default: str | None = None
class AssemblyTemplateInfo(BaseModel, frozen=True):
    """Metadata for an assembly template, read from metadata.json."""

    # ``slug`` is the only mandatory field; ``SubsetData.assembly_templates``
    # uses it both as the dictionary key and as the ``<slug>.py`` file stem.
    slug: str
    name: str = ""
    description: str = ""
    assembler_name: str = ""
    slots: list[SlotInfo] = []
# Immutable top-level metadata record for one subset.  Only ``subset`` is
# mandatory; every other field has an empty or ``None`` default.
class SubsetMetadata(BaseModel, frozen=True):
    subset: str
    subset_name: str = ""
    description: str = ""
    data_format: str = ""
    # Per-entity summaries keyed by a string name.
    entities: dict[str, EntityMetadata] = {}
    structures_count: int | None = None
    client_version: str | None = None
    assembly_templates: list[AssemblyTemplateInfo] = []
[docs] @dataclasses.dataclass(frozen=True) class AssemblyResult: """Result of a single assembly in a batch. For single assembly, use ``template.assemble()`` which returns the SMILES string directly and raises on failure. """ smiles: str | None = None error: str | None = None @property def success(self) -> bool: return self.smiles is not None and self.error is None
@dataclasses.dataclass(frozen=True)
class Molecule(Generic[Properties]):
    """Immutable molecule record: id, typed properties, optional structure file.

    Properties are read by subscript, e.g. ``mol.properties["energy"]``.
    Per-subset type stubs narrow the property types; see
    ``lcmd-db stubs sync``.
    """

    id: int
    properties: Properties
    structure_path: Path | None = None
[docs] class ParticipantRole(StrEnum): """Role of a molecule participating in a reaction.""" REACTANT = "reactant" PRODUCT = "product" CATALYST = "catalyst" CO_CATALYST = "co_catalyst" SUBSTRATE = "substrate" INTERMEDIATE = "intermediate" TRANSITION_STATE = "transition_state" SOLVENT = "solvent" ADDITIVE = "additive"
@dataclasses.dataclass(frozen=True)
class Participant:
    """One molecule taking part in a reaction step, together with its role.

    Attributes:
        molecule (Molecule[dict[str, Value]]): The molecule that participates.
        role (ParticipantRole): What the molecule does in the reaction
            (reactant, product, catalyst, ...).
        step_from (float | None): Index of the first step, when applicable.
        step_to (float | None): Index of the last step, when applicable.
        label (str): Optional human-readable label; empty by default.
    """

    molecule: Molecule[dict[str, Value]]
    role: ParticipantRole
    step_from: float | None = None
    step_to: float | None = None
    label: str = ""
@dataclasses.dataclass(frozen=True)
class Reaction(Generic[Properties]):
    """Immutable reaction record: id, typed properties, participating molecules.

    Properties are read by subscript, e.g. ``rxn.properties["barrier"]``.
    Per-subset type stubs narrow the property types; see
    ``lcmd-db stubs sync``.
    """

    id: int
    properties: Properties
    # default_factory gives every instance its own (initially empty) list.
    participants: list[Participant] = dataclasses.field(default_factory=list)
@dataclasses.dataclass(frozen=True)
class Fragment(Generic[Properties, FType]):
    """Immutable molecular-fragment record with typed properties.

    Properties are read by subscript, e.g. ``frag.properties["charge"]``.
    Per-subset type stubs narrow the property types; see
    ``lcmd-db stubs sync``.
    """

    id: int
    properties: Properties
    # ``None`` is the runtime default even when ``FType`` is narrowed to a
    # type that excludes it, hence the suppressed type-checker diagnostic.
    fragment_type: FType = None  # pyright: ignore[reportAssignmentType]
@dataclasses.dataclass
class SubsetData(Generic[MProperties, RProperties, FProperties, FType, ATemplates]):
    """Bundle of DataFrames and metadata produced by a subset download.

    The class is generic over the molecule, reaction, and fragment property
    types.  Generated ``load_dataset`` overloads bind those parameters to
    per-subset ``TypedDict`` classes, which makes
    ``data.as_dataset("molecules")[0].properties["energy"]`` fully typed.
    Left unparameterised, all three default to ``dict[str, Value]``
    (backward-compatible).
    """

    molecules: pl.DataFrame | None = None
    reactions: pl.DataFrame | None = None
    fragments: pl.DataFrame | None = None
    metadata: SubsetMetadata | None = None

    # Private download context.  Excluded from ``__init__`` and ``repr``;
    # ``_from_download`` fills in the directory and format, and
    # ``assembly_templates`` fills in the cache on first successful build.
    _data_dir: Path | None = dataclasses.field(default=None, repr=False, init=False)
    _data_format: DataFormat = dataclasses.field(
        default=DataFormat.PARQUET, repr=False, init=False
    )
    _assembly_cache: dict[str, object] | None = dataclasses.field(
        default=None, repr=False, init=False
    )

    @classmethod
    def _from_download(
        cls,
        *,
        data_dir: Path,
        data_format: DataFormat,
        molecules: pl.DataFrame | None = None,
        reactions: pl.DataFrame | None = None,
        fragments: pl.DataFrame | None = None,
        metadata: SubsetMetadata | None = None,
    ) -> SubsetData:
        """Internal factory that also records the private download context.

        Builds the instance through the regular constructor, then assigns
        ``_data_dir`` and ``_data_format``, which the constructor does not
        accept (``init=False``).
        """
        data = cls(
            molecules=molecules,
            reactions=reactions,
            fragments=fragments,
            metadata=metadata,
        )
        data._data_dir = data_dir
        data._data_format = data_format
        return data

    @property
    def dataframes(self) -> dict[str, pl.DataFrame]:
        """Map each entity type name to its DataFrame, skipping ``None`` ones."""
        available: dict[str, pl.DataFrame] = {}
        for name in ENTITY_TYPES:
            frame = getattr(self, name)
            if frame is not None:
                available[name] = frame
        return available

    @overload
    def as_dataset(
        self, entity_type: Literal["molecules"]
    ) -> MoleculeDataset[MProperties]: ...

    @overload
    def as_dataset(
        self, entity_type: Literal["reactions"]
    ) -> ReactionDataset[RProperties]: ...

    @overload
    def as_dataset(
        self, entity_type: Literal["fragments"]
    ) -> FragmentDataset[FProperties, FType]: ...

    def as_dataset(
        self, entity_type: str
    ) -> (
        Dataset[Molecule[MProperties]]
        | Dataset[Reaction[RProperties]]
        | Dataset[Fragment[FProperties, FType]]
    ):
        """Build a typed :class:`~lcmd_db.Dataset` for the given entity type.

        Args:
            entity_type: One of ``"molecules"``, ``"reactions"``, or
                ``"fragments"``.

        Returns:
            A :class:`MoleculeDataset`, :class:`ReactionDataset`, or
            :class:`FragmentDataset`, depending on the entity type.

        Raises:
            ValueError: If no data directory is available or the entity
                type is unknown.

        Example::

            molecules = data.as_dataset("molecules")  # MoleculeDataset
            reactions = data.as_dataset("reactions")  # ReactionDataset
        """
        # Imported lazily, inside the method rather than at module level.
        from .dataset._factory import build_dataset

        data_dir = self._data_dir
        if data_dir is None:
            raise ValueError(
                "No data directory available — dataset was not loaded from cache"
            )
        return build_dataset(
            entity_type,
            data_dir=data_dir,
            data_format=self._data_format,
            metadata=self.metadata,
        )

    @property
    def assembly_templates(self) -> ATemplates:
        """Assembly templates available for this subset, keyed by slug.

        Each template can assemble fragment SMILES into a molecule via
        ``template.assemble(core="...", sub1="...")``.

        Only templates whose ``<slug>.py`` file exists under the
        ``assembly`` sub-directory of the data directory are included.  The
        mapping is built once and cached; when the data directory or the
        metadata is missing, an empty dict is returned and nothing is cached.
        """
        cached = self._assembly_cache
        if cached is not None:
            return cached  # pyright: ignore[reportReturnType]

        from ._assembly import AssemblyTemplate

        if self._data_dir is None or self.metadata is None:
            return {}  # pyright: ignore[reportReturnType]

        assembly_dir = self._data_dir / "assembly"
        templates: dict[str, object] = {}
        for info in self.metadata.assembly_templates:
            script = assembly_dir / f"{info.slug}.py"
            if script.exists():
                templates[info.slug] = AssemblyTemplate(info, script)
        self._assembly_cache = templates
        return templates  # pyright: ignore[reportReturnType]