Source code for lcmd_db.dataset._molecules
"""MoleculeDataset — one row per Molecule entry."""
from __future__ import annotations
from pathlib import Path
from typing import Generic
from ..types import EntityMetadata, Molecule, Properties
from ._base import Dataset, _extract_properties
from ._source import DataSource
[docs]
class MoleculeDataset(Dataset[Molecule[Properties]], Generic[Properties]):  # noqa: UP046
    """Dataset that yields one :class:`Molecule` per row of its data source.

    Every row is turned into a ``Molecule`` carrying the row's integer ``id``,
    the properties extracted from the row, and the structure path looked up
    in the configured structures directory (``None`` when unavailable).
    """

    def __init__(
        self,
        source: DataSource,
        *,
        structures_dir: Path | None = None,
        metadata: EntityMetadata | None = None,
    ) -> None:
        """Create a dataset over *source*.

        Args:
            source: Data source handed to the base ``Dataset``.
            structures_dir: Directory searched for per-molecule structure
                files; ``None`` means no structure paths are resolved.
            metadata: Optional entity metadata handed to the base ``Dataset``.
        """
        super().__init__(source, metadata=metadata)
        self._structures_dir = structures_dir

    def _resolve_entry(self, row: dict[str, object]) -> Molecule[Properties]:
        """Build the ``Molecule`` described by a single source *row*.

        The row must contain an ``"id"`` key whose value is convertible to
        ``int``; that id is also used to look up the structure file.
        """
        entry_id = int(row["id"])  # type: ignore[arg-type]
        molecule = Molecule(
            id=entry_id,
            properties=_extract_properties(row),  # type: ignore[arg-type]
            structure_path=resolve_structure(self._structures_dir, entry_id),
        )
        return molecule

    def _with_source(self, source: DataSource) -> MoleculeDataset[Properties]:
        """Return a new ``MoleculeDataset`` over *source*.

        The structures directory and metadata of this dataset are carried
        over unchanged.
        """
        sibling = MoleculeDataset(
            source,
            structures_dir=self._structures_dir,
            metadata=self._metadata,
        )
        return sibling
def resolve_structure(structures_dir: Path | None, mol_id: int) -> Path | None:
    """Return the path of the ``.xyz`` structure file for *mol_id*, if any.

    Args:
        structures_dir: Directory expected to hold ``<mol_id>.xyz`` files, or
            ``None`` when no structures directory is configured.
        mol_id: Molecule identifier used to build the file name.

    Returns:
        ``structures_dir / f"{mol_id}.xyz"`` when that path is an existing
        file; ``None`` when *structures_dir* is ``None``, when nothing exists
        at that path, or when the path is not a file (e.g. a directory).
    """
    if structures_dir is None:
        return None
    candidate = structures_dir / f"{mol_id}.xyz"
    # ``is_file`` rather than ``exists``: a directory that happens to be
    # named ``<id>.xyz`` is not a usable structure file.
    return candidate if candidate.is_file() else None