Source code for lcmd_db.dataset._molecules

"""MoleculeDataset — one row per Molecule entry."""

from __future__ import annotations

from pathlib import Path
from typing import Generic

from ..types import EntityMetadata, Molecule, Properties
from ._base import Dataset, _extract_properties
from ._source import DataSource


class MoleculeDataset(Dataset[Molecule[Properties]], Generic[Properties]):  # noqa: UP046
    """Dataset whose entries are :class:`Molecule` objects, one per source row.

    Args:
        source: Data source handed to the base ``Dataset``.
        structures_dir: Optional directory searched for ``<id>.xyz`` files.
            When ``None``, every entry gets ``structure_path=None``.
        metadata: Optional entity metadata forwarded to the base ``Dataset``.
    """

    def __init__(
        self,
        source: DataSource,
        *,
        structures_dir: Path | None = None,
        metadata: EntityMetadata | None = None,
    ) -> None:
        super().__init__(source, metadata=metadata)
        # Kept so that datasets derived via ``_with_source`` look in the same place.
        self._structures_dir = structures_dir

    def _resolve_entry(self, row: dict[str, object]) -> Molecule[Properties]:
        """Build the ``Molecule`` for a single row.

        The row's ``"id"`` value is coerced to ``int`` and also used to look
        up the structure file via ``resolve_structure``.
        """
        molecule_id = int(row["id"])  # type: ignore[arg-type]
        props = _extract_properties(row)
        xyz_path = resolve_structure(self._structures_dir, molecule_id)
        return Molecule(
            id=molecule_id,
            properties=props,  # type: ignore[arg-type]
            structure_path=xyz_path,
        )

    def _with_source(self, source: DataSource) -> MoleculeDataset[Properties]:
        """Return a new ``MoleculeDataset`` over *source*, reusing this
        instance's structures directory and metadata."""
        return MoleculeDataset(
            source,
            structures_dir=self._structures_dir,
            metadata=self._metadata,
        )
def resolve_structure(structures_dir: Path | None, mol_id: int) -> Path | None:
    """Return the path of the ``.xyz`` file for *mol_id*, if one is present.

    Args:
        structures_dir: Directory expected to contain ``<mol_id>.xyz``, or
            ``None`` when no structures directory is configured.
        mol_id: Identifier used to build the file name.

    Returns:
        ``structures_dir / "<mol_id>.xyz"`` when that path exists; ``None``
        when it does not exist or when *structures_dir* is ``None``.
    """
    if structures_dir is not None:
        candidate = structures_dir / f"{mol_id}.xyz"
        if candidate.exists():
            return candidate
    # No directory configured, or nothing on disk for this id.
    return None