| from __future__ import annotations |
|
|
| import pandas as pd |
| import streamlit as st |
| from rdkit import Chem |
| from rdkit import RDLogger |
|
|
# Turn off every RDKit log channel matching "rdApp.*" as soon as this module
# is imported.
# NOTE(review): presumably this keeps RDKit's parse warnings for invalid
# SMILES out of the app output -- confirm.
RDLogger.DisableLog("rdApp.*")
|
|
| |
| |
| |
# Keys of the property data sources, in the order they are loaded.
# load_source_csv() reads each one from data/<KEY>.csv.
SOURCES = ["EXP", "MD", "DFT", "GC"]

# Human-readable label for every source key.
SOURCE_LABELS = dict(
    EXP="Experimental",
    MD="Molecular Dynamics",
    DFT="Density Functional Theory",
    GC="Group Contribution",
)

# Path of the polymer name/class lookup table read by load_polyinfo_csv().
POLYINFO_FILE = "data/POLYINFO.csv"
|
|
|
|
def canonicalize_smiles(smiles: str) -> str | None:
    """Return the RDKit canonical SMILES for ``smiles``.

    Leading and trailing whitespace is ignored. The result is ``None`` when
    the input is ``None``, empty or blank, or when RDKit cannot parse it.
    """
    text = smiles.strip() if smiles else ""
    if text:
        parsed = Chem.MolFromSmiles(text)
        if parsed is not None:
            return Chem.MolToSmiles(parsed, canonical=True)
    return None
|
|
|
|
| |
# Display metadata for each property key: a human-readable name and a unit.
# NOTE(review): the keys presumably match the lowercased property column
# names produced by load_source_csv() -- confirm against the CSV headers.
#
# Non-ASCII unit characters are written as escapes so the file stays pure
# ASCII and cannot be corrupted by being read or saved in the wrong encoding:
#   \u00b7 = middle dot, \u00c5 = angstrom sign (A with ring above).
PROPERTY_META = {
    # Thermal
    "tm": {"name": "Melting temperature", "unit": "K"},
    "tg": {"name": "Glass transition temperature", "unit": "K"},
    "td": {"name": "Thermal diffusivity", "unit": "m^2/s"},
    "tc": {"name": "Thermal conductivity", "unit": "W/m\u00b7K"},
    "cp": {"name": "Specific heat capacity", "unit": "J/kg\u00b7K"},
    # Mechanical
    "young": {"name": "Young's modulus", "unit": "GPa"},
    "shear": {"name": "Shear modulus", "unit": "GPa"},
    "bulk": {"name": "Bulk modulus", "unit": "GPa"},
    "poisson": {"name": "Poisson ratio", "unit": "-"},
    # Transport
    "visc": {"name": "Viscosity", "unit": "Pa\u00b7s"},
    "dif": {"name": "Diffusivity", "unit": "cm^2/s"},
    # Gas permeability
    "phe": {"name": "He permeability", "unit": "Barrer"},
    "ph2": {"name": "H2 permeability", "unit": "Barrer"},
    "pco2": {"name": "CO2 permeability", "unit": "Barrer"},
    "pn2": {"name": "N2 permeability", "unit": "Barrer"},
    "po2": {"name": "O2 permeability", "unit": "Barrer"},
    "pch4": {"name": "CH4 permeability", "unit": "Barrer"},
    # Electronic, optical and dielectric
    "alpha": {"name": "Polarizability", "unit": "a.u."},
    "homo": {"name": "HOMO energy", "unit": "eV"},
    "lumo": {"name": "LUMO energy", "unit": "eV"},
    "bandgap": {"name": "Band gap", "unit": "eV"},
    "mu": {"name": "Dipole moment", "unit": "Debye"},
    "etotal": {"name": "Total electronic energy", "unit": "eV"},
    "ri": {"name": "Refractive index", "unit": "-"},
    "dc": {"name": "Dielectric constant", "unit": "-"},
    "pe": {"name": "Permittivity", "unit": "-"},
    # Structural
    "rg": {"name": "Radius of gyration", "unit": "\u00c5"},
    "rho": {"name": "Density", "unit": "g/cm^3"},
}
|
|
|
|
@st.cache_data
def load_source_csv(source: str) -> pd.DataFrame:
    """Read ``data/{source}.csv`` and return it in normalized form.

    Normalization:
    - the SMILES column (``SMILES`` or ``smiles``) ends up named ``smiles``
    - all other column names are lowercased
    - ``smiles_canon`` holds ``canonicalize_smiles`` applied to the
      string form of each ``smiles`` value
    - rows without a canonical SMILES are dropped and the index is
      renumbered from 0

    Raises:
        ValueError: if the file has neither a ``SMILES`` nor a ``smiles``
            column.
    """
    path = f"data/{source}.csv"
    df = pd.read_csv(path)

    if "SMILES" not in df.columns and "smiles" not in df.columns:
        raise ValueError(f"{path} missing SMILES column")

    # One lowercase pass handles both renames: "SMILES" becomes "smiles"
    # and every property column is lowercased ("smiles" maps to itself).
    lowered = {column: column.lower() for column in df.columns}
    df = df.rename(columns=lowered)

    canonical = df["smiles"].astype(str).apply(canonicalize_smiles)
    df["smiles_canon"] = canonical
    return df.dropna(subset=["smiles_canon"]).reset_index(drop=True)
|
|
|
|
@st.cache_data
def build_index(df: pd.DataFrame) -> dict[str, int]:
    """Map each canonical SMILES in ``df`` to the position of its first row.

    Falsy ``smiles_canon`` entries are skipped; when a SMILES occurs more
    than once, the earliest row position is kept.
    """
    first_seen: dict[str, int] = {}
    for position, canon in enumerate(df["smiles_canon"].tolist()):
        if canon:
            # setdefault leaves an existing (earlier) position untouched.
            first_seen.setdefault(canon, position)
    return first_seen
|
|
|
|
@st.cache_data
def load_polyinfo_csv() -> pd.DataFrame:
    """Load the polymer name/class table from ``POLYINFO_FILE``.

    The file is expected to have the columns ``SMILES``, ``Polymer_Class``
    and ``Polymer_Name``. They are renamed to ``smiles``, ``polymer_class``
    and ``polymer_name``; a missing class or name column is created and
    filled with ``pd.NA``. A ``smiles_canon`` column is added, rows whose
    SMILES cannot be canonicalized are dropped, and the index is renumbered
    from 0.

    The table is optional. An empty frame with the four columns above is
    returned when the file cannot be read or parsed, or when it has no
    SMILES column. Any other exception is allowed to propagate so that
    programming errors are not hidden.
    """
    columns = ["smiles", "polymer_class", "polymer_name", "smiles_canon"]

    try:
        df = pd.read_csv(POLYINFO_FILE)
    except (
        OSError,  # missing file, permission problem, path is a directory
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
        UnicodeDecodeError,
    ):
        return pd.DataFrame(columns=columns)

    if "SMILES" not in df.columns and "smiles" not in df.columns:
        return pd.DataFrame(columns=columns)

    # rename() ignores mapping keys that are not present, so one call covers
    # whichever of the three source headers the file actually has.
    df = df.rename(
        columns={
            "SMILES": "smiles",
            "Polymer_Class": "polymer_class",
            "Polymer_Name": "polymer_name",
        }
    )

    # Guarantee both descriptive columns exist for downstream lookups.
    for optional in ("polymer_class", "polymer_name"):
        if optional not in df.columns:
            df[optional] = pd.NA

    df["smiles_canon"] = df["smiles"].astype(str).apply(canonicalize_smiles)
    return df.dropna(subset=["smiles_canon"]).reset_index(drop=True)
|
|
|
|
@st.cache_data
def load_all_sources():
    """Load every data source plus the POLYINFO table into one dict.

    Returns:
        A dict with one entry per key in ``SOURCES`` and one ``"POLYINFO"``
        entry. Each entry is ``{"df": DataFrame, "idx": dict}``, where
        ``idx`` maps canonical SMILES to a row position in ``df``. The
        POLYINFO index is an empty dict when its frame is empty.
    """
    db = {}
    for source_key in SOURCES:
        frame = load_source_csv(source_key)
        db[source_key] = {"df": frame, "idx": build_index(frame)}

    polyinfo = load_polyinfo_csv()
    polyinfo_index = {} if polyinfo.empty else build_index(polyinfo)
    db["POLYINFO"] = {"df": polyinfo, "idx": polyinfo_index}

    return db
|
|
|
|
def get_value(db, source: str, smiles_canon: str, prop_key: str) -> float | None:
    """Look up one numeric property value for a polymer in one source.

    Args:
        db: Mapping of source key to ``{"df": DataFrame, "idx": dict}``.
        source: Key of the source to read from.
        smiles_canon: Canonical SMILES used as the lookup key in ``idx``.
        prop_key: Name of the property column in ``df``.

    Returns:
        The value as ``float``, or ``None`` when the SMILES is not indexed,
        the column does not exist, the cell is missing/NaN, or the cell
        cannot be converted to a number.

    Raises:
        KeyError: if ``source`` is not a key of ``db``.
    """
    pack = db[source]
    df, idx = pack["df"], pack["idx"]

    row_i = idx.get(smiles_canon)
    if row_i is None or prop_key not in df.columns:
        return None

    # Read the single cell from its column; building the whole row first
    # would copy every column just to pick one value.
    val = df[prop_key].iat[row_i]

    try:
        number = float(val)
    except (TypeError, ValueError):
        # None/pd.NA or non-numeric text such as "n/a": no usable value.
        return None

    # Covers real NaN cells as well as text that parses to NaN ("nan").
    return None if pd.isna(number) else number
|
|
|
|
def get_polyinfo(db, smiles_canon: str) -> tuple[str | None, str | None]:
    """Return ``(polymer_name, polymer_class)`` for a canonical SMILES.

    Each element is a stripped string, or ``None`` when the value is
    missing, NaN or blank. ``(None, None)`` is returned when ``db`` has no
    ``"POLYINFO"`` entry, its frame is ``None`` or empty, or the SMILES is
    not in its index. No placeholder text is produced here.
    """
    nothing = (None, None)

    pack = db.get("POLYINFO")
    if pack is None:
        return nothing

    df, idx = pack["df"], pack["idx"]
    if df is None or df.empty:
        return nothing

    row_i = idx.get(smiles_canon)
    if row_i is None:
        return nothing

    def clean(value):
        # Missing values and whitespace-only text both collapse to None.
        if pd.isna(value):
            return None
        text = str(value).strip()
        return text or None

    row = df.iloc[row_i]
    return clean(row.get("polymer_name", None)), clean(row.get("polymer_class", None))
|
|