Source code for lcmd_db.types

"""Type definitions for LCMD-DB client."""

from __future__ import annotations

import dataclasses
import sys
from collections.abc import Mapping
from pathlib import Path
from typing import TYPE_CHECKING, Generic, Literal, TypeAlias, overload

if sys.version_info >= (3, 11):  # noqa: UP036 — StrEnum needs backport on 3.10
    from enum import StrEnum
else:
    from enum import Enum as _Enum

    class StrEnum(str, _Enum):  # noqa: UP042
        # Minimal stand-in for ``enum.StrEnum`` on Python 3.10.
        #
        # A bare ``(str, Enum)`` mix-in renders ``str(member)`` as
        # ``"ClassName.MEMBER"``, whereas the stdlib ``StrEnum`` renders the
        # member's value.  Overriding ``__str__`` keeps ``str()`` and f-string
        # output identical on every supported interpreter.
        #
        # NOTE: ``enum.auto()`` is not backported; members must be given
        # explicit string values (as every enum in this module does).
        def __str__(self) -> str:
            return str.__str__(self)


if sys.version_info >= (3, 13):  # noqa: UP036 — TypeVar(default=) needs typing_extensions on <3.13
    from typing import TypeVar
else:
    from typing_extensions import TypeVar  # noqa: UP035

import polars as pl
from pydantic import BaseModel, field_validator

if TYPE_CHECKING:
    from .dataset._base import Dataset
    from .dataset._fragments import FragmentDataset
    from .dataset._molecules import MoleculeDataset
    from .dataset._reactions import ReactionDataset

# Scalar value types allowed in an untyped property mapping.
Value: TypeAlias = str | int | float | bool | None  # noqa: UP040

# Property-mapping type parameter for the single-entity containers
# (``Molecule``, ``Reaction``, ``Fragment``).  Any string-keyed mapping is
# accepted; when left unspecified it defaults to ``dict[str, Value]``.
Properties = TypeVar(
    "Properties",
    bound=Mapping[str, object],
    default=dict[str, Value],
)

# The next three TypeVars have the same bound and default as ``Properties``;
# ``SubsetData`` uses them to carry the molecule, reaction and fragment
# property types independently of one another.
MProperties = TypeVar(
    "MProperties",
    bound=Mapping[str, object],
    default=dict[str, Value],
)

RProperties = TypeVar(
    "RProperties",
    bound=Mapping[str, object],
    default=dict[str, Value],
)

FProperties = TypeVar(
    "FProperties",
    bound=Mapping[str, object],
    default=dict[str, Value],
)

# Type of ``Fragment.fragment_type``; unbound, defaults to ``str | None``.
FType = TypeVar("FType", default=str | None)

# Return type of ``SubsetData.assembly_templates``; unbound, defaults to
# ``dict[str, object]``.
ATemplates = TypeVar("ATemplates", default=dict[str, object])



[docs]
class DataFormat(StrEnum):
    # String-valued identifiers for data file formats.  ``SubsetData`` records
    # one of these as the format of its download, defaulting to ``PARQUET``.
    CSV = "csv"
    TSV = "tsv"
    XLSX = "xlsx"
    PARQUET = "parquet"
    JSON = "json"




[docs]
class EntityType(StrEnum):
    # String-valued names of the three entity kinds.  The values are the same
    # strings, in the same order, as the module-level ``ENTITY_TYPES`` tuple.
    MOLECULES = "molecules"
    REACTIONS = "reactions"
    FRAGMENTS = "fragments"




[docs]
class PropertyDataType(StrEnum):
    # Data-type tags for a property.  ``PropertyInfo.data_type`` holds one of
    # these and falls back to ``STRING`` when given an unrecognized value.
    FLOAT = "float"
    INTEGER = "integer"
    STRING = "string"
    BOOLEAN = "boolean"



# Allowed string values for an "include" selection: the three entity types
# plus "structures".  NOTE(review): the consumer of this alias is not in this
# module — confirm where it is used.
Include = Literal["molecules", "reactions", "fragments", "structures"]

# Entity type names in a fixed order.  ``SubsetData.dataframes`` uses them as
# attribute names when collecting the DataFrames that are present.
ENTITY_TYPES: tuple[str, ...] = ("molecules", "reactions", "fragments")



[docs]
class PropertyInfo(BaseModel, frozen=True):
    # Frozen model describing a single property.  Only ``slug`` is required;
    # every other field carries a default.
    slug: str
    name: str = ""
    data_type: PropertyDataType = PropertyDataType.STRING
    units: str | None = None
    description: str | None = None
    is_intrinsic: bool = False
    is_required: bool | None = None

    @field_validator("data_type", mode="before")
    @classmethod
    def _coerce_data_type(cls, v: object) -> PropertyDataType:
        """Normalize the raw ``data_type`` input to a :class:`PropertyDataType`.

        Enum members pass through untouched.  Anything else is stringified and
        looked up by value; input that matches no member becomes
        ``PropertyDataType.STRING`` instead of failing validation.
        """
        try:
            return v if isinstance(v, PropertyDataType) else PropertyDataType(str(v))
        except ValueError:
            # Best-effort: an unknown type name is treated as a string column.
            return PropertyDataType.STRING




[docs]
class EntityMetadata(BaseModel, frozen=True):
    # Frozen model holding summary metadata for one entity type; every field
    # has a default, so an empty instance is valid.
    # NOTE(review): ``count`` / ``columns`` are presumably the number of rows
    # and columns of the entity table — confirm against the metadata producer.
    count: int = 0
    columns: int = 0
    # Descriptions of the entity's properties.
    # NOTE(review): the literal ``[]`` default relies on pydantic giving each
    # instance its own copy of a mutable default — confirm for the pinned version.
    properties: list[PropertyInfo] = []




[docs]
class SlotInfo(BaseModel, frozen=True):
    """Metadata for a single assembly template slot."""

    # ``id`` and ``fragment_type`` are required; the remaining fields default
    # to an empty description, ``required=True`` and no default value.
    id: str
    fragment_type: str
    description: str = ""
    required: bool = True
    default: str | None = None




[docs]
class AssemblyTemplateInfo(BaseModel, frozen=True):
    """Metadata for an assembly template, read from metadata.json."""

    # ``slug`` is the only required field.  ``SubsetData.assembly_templates``
    # uses it both as the dictionary key and as the stem of the template's
    # ``<slug>.py`` file.
    slug: str
    name: str = ""
    description: str = ""
    assembler_name: str = ""
    # Slots declared by the template, one ``SlotInfo`` each.
    slots: list[SlotInfo] = []




[docs]
class SubsetMetadata(BaseModel, frozen=True):
    # Frozen model with the metadata of one subset.  Only ``subset`` is
    # required; all other fields have defaults.
    subset: str
    subset_name: str = ""
    description: str = ""
    # Kept as a plain string here, not as a ``DataFormat`` member.
    data_format: str = ""
    # NOTE(review): presumably keyed by entity type name ("molecules", ...) —
    # confirm against the metadata producer.
    entities: dict[str, EntityMetadata] = {}
    structures_count: int | None = None
    client_version: str | None = None
    # Read by ``SubsetData.assembly_templates`` to build the template mapping.
    assembly_templates: list[AssemblyTemplateInfo] = []




[docs]
@dataclasses.dataclass(frozen=True)
class AssemblyResult:
    """Result of a single assembly in a batch.

    For single assembly, use ``template.assemble()`` which returns
    the SMILES string directly and raises on failure.
    """

    smiles: str | None = None
    error: str | None = None

    @property
    def success(self) -> bool:
        return self.smiles is not None and self.error is None




[docs]
@dataclasses.dataclass(frozen=True)
class Molecule(Generic[Properties]):
    """A molecule entry with typed properties and optional structure file.

    Access properties via subscript: ``mol.properties["energy"]``.
    Per-subset type stubs narrow property types; see ``lcmd-db stubs sync``.
    """

    # Frozen dataclass: fields cannot be reassigned after construction.
    id: int  # integer identifier of this entry
    properties: Properties  # string-keyed mapping; ``dict[str, Value]`` unless parametrized
    structure_path: Path | None = None  # path to the structure file; None when there is none




[docs]
class ParticipantRole(StrEnum):
    """Role of a molecule participating in a reaction."""

    # Each value is the lower-case form of its member name.
    REACTANT = "reactant"
    PRODUCT = "product"
    CATALYST = "catalyst"
    CO_CATALYST = "co_catalyst"
    SUBSTRATE = "substrate"
    INTERMEDIATE = "intermediate"
    TRANSITION_STATE = "transition_state"
    SOLVENT = "solvent"
    ADDITIVE = "additive"




[docs]
@dataclasses.dataclass(frozen=True)
class Participant:
    """A molecule's participation in a reaction step.

    Attributes:
        molecule (Molecule[dict[str, Value]]): The participating molecule.
        role (ParticipantRole): Role in the reaction (reactant, product, catalyst, ...).
        step_from (float | None): Starting step index, if applicable.
        step_to (float | None): Ending step index, if applicable.
        label (str): Optional human-readable label.
    """

    # The molecule always uses the untyped ``dict[str, Value]`` property
    # mapping here, regardless of how the enclosing reaction is parametrized.
    molecule: Molecule[dict[str, Value]]
    role: ParticipantRole
    # Step bounds are floats and default to None when not given.
    step_from: float | None = None
    step_to: float | None = None
    label: str = ""  # empty string when no label was given




[docs]
@dataclasses.dataclass(frozen=True)
class Reaction(Generic[Properties]):
    """A reaction entry with typed properties and participant molecules.

    Access properties via subscript: ``rxn.properties["barrier"]``.
    Per-subset type stubs narrow property types; see ``lcmd-db stubs sync``.
    """

    id: int  # integer identifier of this entry
    properties: Properties  # string-keyed mapping; ``dict[str, Value]`` unless parametrized
    # ``default_factory`` gives every instance its own empty list.  The
    # dataclass is frozen, but the list object itself remains mutable.
    participants: list[Participant] = dataclasses.field(default_factory=list)




[docs]
@dataclasses.dataclass(frozen=True)
class Fragment(Generic[Properties, FType]):
    """A molecular fragment entry with typed properties.

    Access properties via subscript: ``frag.properties["charge"]``.
    Per-subset type stubs narrow property types; see ``lcmd-db stubs sync``.
    """

    id: int  # integer identifier of this entry
    properties: Properties  # string-keyed mapping; ``dict[str, Value]`` unless parametrized
    # ``None`` fits the default ``FType`` (``str | None``) but not every
    # possible parametrization, hence the suppressed assignment-type check.
    fragment_type: FType = None  # pyright: ignore[reportAssignmentType]




[docs]
@dataclasses.dataclass
class SubsetData(Generic[MProperties, RProperties, FProperties, FType, ATemplates]):
    """Container for data returned by a subset download.

    Generic over molecule, reaction, and fragment property types, plus the
    fragment-type (``FType``) and assembly-template mapping (``ATemplates``)
    types.  When returned by a generated ``load_dataset`` overload the type
    parameters are bound to per-subset ``TypedDict`` classes so that
    ``data.as_dataset("molecules")[0].properties["energy"]`` is fully typed.

    Without explicit type parameters, the three property types default to
    ``dict[str, Value]`` (backward-compatible), ``FType`` to ``str | None``
    and ``ATemplates`` to ``dict[str, object]``.
    """

    # Public, constructor-settable fields.  Each entity table is optional and
    # is None when absent.
    molecules: pl.DataFrame | None = None
    reactions: pl.DataFrame | None = None
    fragments: pl.DataFrame | None = None
    metadata: SubsetMetadata | None = None
    # Private download context: excluded from ``__init__`` and ``repr``.
    # ``_from_download`` assigns the directory and format after construction.
    _data_dir: Path | None = dataclasses.field(default=None, repr=False, init=False)
    _data_format: DataFormat = dataclasses.field(
        default=DataFormat.PARQUET, repr=False, init=False
    )
    # Memo for the ``assembly_templates`` property; None until first built.
    _assembly_cache: dict[str, object] | None = dataclasses.field(
        default=None, repr=False, init=False
    )

    @classmethod
    def _from_download(
        cls,
        *,
        data_dir: Path,
        data_format: DataFormat,
        molecules: pl.DataFrame | None = None,
        reactions: pl.DataFrame | None = None,
        fragments: pl.DataFrame | None = None,
        metadata: SubsetMetadata | None = None,
    ) -> SubsetData:
        """Internal factory that also records where the data was downloaded.

        The public fields go through the regular constructor.  The private
        ``_data_dir`` / ``_data_format`` fields are not ``__init__``
        parameters, so they are assigned on the new object afterwards.
        """
        loaded = cls(
            metadata=metadata,
            fragments=fragments,
            reactions=reactions,
            molecules=molecules,
        )
        loaded._data_dir, loaded._data_format = data_dir, data_format
        return loaded

    @property
    def dataframes(self) -> dict[str, pl.DataFrame]:
        """Map each entity type name to its DataFrame, skipping those that are None.

        Keys follow the order of ``ENTITY_TYPES``.
        """
        present: dict[str, pl.DataFrame] = {}
        for entity in ENTITY_TYPES:
            frame = getattr(self, entity)
            if frame is not None:
                present[entity] = frame
        return present

    # Typing-only overloads: each literal entity type maps to the matching
    # dataset class, parametrized with this container's type arguments.
    @overload
    def as_dataset(
        self, entity_type: Literal["molecules"]
    ) -> MoleculeDataset[MProperties]: ...

    @overload
    def as_dataset(
        self, entity_type: Literal["reactions"]
    ) -> ReactionDataset[RProperties]: ...

    @overload
    def as_dataset(
        self, entity_type: Literal["fragments"]
    ) -> FragmentDataset[FProperties, FType]: ...


[docs]
    def as_dataset(
        self, entity_type: str
    ) -> (
        Dataset[Molecule[MProperties]]
        | Dataset[Reaction[RProperties]]
        | Dataset[Fragment[FProperties, FType]]
    ):
        """Build a typed :class:`~lcmd_db.Dataset` for the given entity type.

        The work is delegated to ``build_dataset``, which receives the stored
        data directory, data format and metadata.

        Args:
            entity_type: One of ``"molecules"``, ``"reactions"``, or ``"fragments"``.

        Returns:
            A :class:`MoleculeDataset`, :class:`ReactionDataset`, or
            :class:`FragmentDataset` depending on the entity type.

        Raises:
            ValueError: If no data directory is available.  Unknown entity
                types are handled by ``build_dataset`` (presumably also with
                ``ValueError`` — confirm there).

        Example::

            molecules = data.as_dataset("molecules")   # MoleculeDataset
            reactions = data.as_dataset("reactions")    # ReactionDataset
        """
        # Imported lazily, inside the method, rather than at module level.
        from .dataset._factory import build_dataset

        data_dir = self._data_dir
        if data_dir is None:
            raise ValueError(
                "No data directory available — dataset was not loaded from cache"
            )

        return build_dataset(
            entity_type,
            data_dir=data_dir,
            data_format=self._data_format,
            metadata=self.metadata,
        )


    @property
    def assembly_templates(self) -> ATemplates:
        """Assembly templates available for this subset, keyed by slug.

        Each template can assemble fragment SMILES into a molecule
        via ``template.assemble(core="...", sub1="...")``.

        The mapping is built on first access and cached.  Only templates whose
        ``assembly/<slug>.py`` file exists under the data directory are
        included.  Without a data directory or metadata an empty dict is
        returned and nothing is cached.
        """
        cached = self._assembly_cache
        if cached is not None:
            return cached  # pyright: ignore[reportReturnType]

        from ._assembly import AssemblyTemplate

        if self._data_dir is None or self.metadata is None:
            return {}  # pyright: ignore[reportReturnType]

        assembly_dir = self._data_dir / "assembly"
        templates: dict[str, object] = {}
        for info in self.metadata.assembly_templates:
            script = assembly_dir / f"{info.slug}.py"
            if script.exists():
                templates[info.slug] = AssemblyTemplate(info, script)

        self._assembly_cache = templates
        return templates  # pyright: ignore[reportReturnType]