crystals

Sleeping

App Files Files Community

jannisborn committed on Jan 30, 2023

Commit

63d9b78

•

1 Parent(s): 1ff7fe3

update

Browse files

Files changed (3) hide show

app.py +21 -4
model_cards/article.md +27 -49
model_cards/metal.csv +9 -12

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import pathlib
 import shutil
 import tempfile
 from pathlib import Path
 import gradio as gr
 import pandas as pd
@@ -12,7 +13,7 @@ from gt4sd.properties.crystals import CRYSTALS_PROPERTY_PREDICTOR_FACTORY
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
-suffix_dict = {"metal_nonmetal_classifier": ".csv"}
 def create_temp_file(path: str) -> str:
@@ -36,16 +37,33 @@ def main(property: str, data_file: str):
     if data_file is None:
         raise TypeError("You have to pass either an input file for the crystal model")
     # Copy file into a UNIQUE temporary directory
-    file_path = Path(create_temp_file(data_file.name))
     folder = file_path.parent
     print(file_path)
     print(folder)
     if file_path.suffix == ".cif":
         input_path = folder
     elif file_path.suffix == ".csv":
         input_path = file_path
     elif file_path.suffix == ".zip":
         # Unzip zip
         shutil.unpack_archive(file_path, file_path.parent)
         if len(list(filter(lambda x: x.endswith(".cif"), os.listdir(folder)))) == 0:
@@ -58,7 +76,6 @@ def main(property: str, data_file: str):
             f" `.cif` files. Not {type(data_file)}."
         )
-    prop_name = property.replace(" ", "_").lower()
     algo, config = CRYSTALS_PROPERTY_PREDICTOR_FACTORY[prop_name]
     # Pass hyperparameters if applicable
     kwargs = {"algorithm_version": "v0"}
@@ -80,7 +97,7 @@ if __name__ == "__main__":
     examples = [
         ["Formation Energy", metadata_root.joinpath("7206075.cif")],
         ["Bulk moduli", metadata_root.joinpath("crystals.zip")],
-        # ["Metal Nonmetal Classifier", metadata_root.joinpath("metal.csv")],
         ["Bulk moduli", metadata_root.joinpath("9000046.cif")],
     ]

 import shutil
 import tempfile
 from pathlib import Path
+from collections import defaultdict
 import gradio as gr
 import pandas as pd
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
+suffix_dict = {"metal_nonmetal_classifier": [".csv"]}
 def create_temp_file(path: str) -> str:
     if data_file is None:
         raise TypeError("You have to pass either an input file for the crystal model")
+    prop_name = property.replace(" ", "_").lower()
     # Copy file into a UNIQUE temporary directory
+    if data_file.name.endswith("cfsdfsdsv"):
+        file_path = Path(create_temp_file(data_file.orig_name))
+    else:
+        file_path = Path(create_temp_file(data_file.name))
     folder = file_path.parent
     print(file_path)
     print(folder)
     if file_path.suffix == ".cif":
+        if ".cif" not in suffix_dict.get(prop_name, [".cif", ".zip"]):
+            raise ValueError(
+                f"For this property, provide {suffix_dict[prop_name]}, not `.cif`."
+            )
         input_path = folder
     elif file_path.suffix == ".csv":
+        if ".csv" not in suffix_dict.get(prop_name, [".cif", ".zip"]):
+            raise ValueError(
+                f"For this property, provide {suffix_dict.get(prop_name, ['.cif', '.zip'])}, not `.csv`."
+            )
         input_path = file_path
     elif file_path.suffix == ".zip":
+        if ".zip" not in suffix_dict.get(prop_name, [".cif", ".zip"]):
+            raise ValueError(
+                f"For this property, provide {suffix_dict[prop_name]}, not `.zip`."
+            )
         # Unzip zip
         shutil.unpack_archive(file_path, file_path.parent)
         if len(list(filter(lambda x: x.endswith(".cif"), os.listdir(folder)))) == 0:
             f" `.cif` files. Not {type(data_file)}."
         )
     algo, config = CRYSTALS_PROPERTY_PREDICTOR_FACTORY[prop_name]
     # Pass hyperparameters if applicable
     kwargs = {"algorithm_version": "v0"}
     examples = [
         ["Formation Energy", metadata_root.joinpath("7206075.cif")],
         ["Bulk moduli", metadata_root.joinpath("crystals.zip")],
+        ["Metal Nonmetal Classifier", metadata_root.joinpath("metal.csv")],
         ["Bulk moduli", metadata_root.joinpath("9000046.cif")],
     ]

model_cards/article.md CHANGED Viewed

@@ -2,52 +2,27 @@
 ## Parameters
-### Algorithm Version
-Which model checkpoint to use (trained on different datasets).
-### Task
-Whether the multitask model should be used for property prediction or conditional generation (default).
-### Input
-The input sequence. In the default setting (where `Task` is *Generate* and `Sampling Wrapper` is *True*) this can be a seed SMILES (for the molecule models) or amino-acid sequence (for the protein models). The model will locally adapt the seed sequence by masking `Fraction to mask` of the tokens.
-If the `Task` is *Predict*, the sequences are given as SELFIES for the molecule models. Moreover, the tokens that should be predicted (`[MASK]` in the input) have to be given explicitly. Populate the examples to understand better.
-NOTE: When setting `Task` to *Generate*, and `Sampling Wrapper` to *False*, the user has maximal control about the generative process and can explicitly decide which tokens should be masked.
-### Number of samples
-How many samples should be generated (between 1 and 50). If `Task` is *Predict*, this has to be set to 1.
-### Search
-Decoding search method. Use *Sample* if `Task` is *Generate*. If `Task` is *Predict*, use *Greedy*.
-### Tolerance
-Precision tolerance; only used if `Task` is *Generate*. This is a single float between 0 and 100 for the tolerated deviation between desired/primed property and predicted property of the generated molecule. Given in percentage with respect to the property range encountered during training.
-NOTE: The tolerance is *only* used for post-hoc filtering of the generated samples.
-### Sampling Wrapper
-Only used if `Task` is *Generate*. If set to *False*, the user has to provide a full RT-sequence as `Input` and has to **explicitly** decide which tokens are masked (see example below). This gives full control but is tedious. Instead, if `Sampling Wrapper` is set to *True*, the RT stochastically determines which parts of the sequence are masked.
-**NOTE**: All below arguments only apply if `Sampling Wrapper` is *True*.
-#### Fraction to mask
-Specifies the ratio of tokens that can be changed by the model. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
-#### Property goal
-Specifies the desired target properties for the generation. Need to be given in the format `<prop>:value`. If the model supports multiple properties, give them separated by a comma `,`. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
-#### Tokens to mask
-Optionally specifies which tokens (atoms, bonds etc) can be masked. Please separate multiple tokens by comma (`,`). If not specified, all tokens can be masked. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
-#### Substructures to mask
-Optionally specifies a list of substructures that should *definitely* be masked (excluded from stochastic masking). Given in SMILES format. If multiple are provided, separate by comma (`,`). Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
-*NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
-#### Substructures to keep
-Optionally specifies a list of substructures that should definitely be present in the target sample (i.e., excluded from stochastic masking). Given in SMILES format. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
-*NOTE*: This keeps tokens even if they are included in `tokens_to_mask`.
-*NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
-# Model card -- Regression Transformer
 **Model Details**: The [Regression Transformer](https://arxiv.org/abs/2202.01338) is a multitask Transformer that reformulates regression as a conditional sequence modeling task. This yields a dichotomous language model that seamlessly integrates property prediction with property-driven conditional generation.
@@ -99,15 +74,18 @@ The [Regression Transformer](https://arxiv.org/abs/2202.01338) paper. See the [s
 Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
-## Citation
 ```bib
-@article{born2022regression,
-  title={Regression Transformer: Concurrent Conditional Generation and Regression by Blending Numerical and Textual Tokens},
-  author={Born, Jannis and Manica, Matteo},
-  journal={arXiv preprint arXiv:2202.01338},
-  note={Spotlight talk at ICLR workshop on Machine Learning for Drug Discovery},
   year={2022}
 }
-```

 ## Parameters
+### Property
+The supported properties are:
+  - `Metal NonMetal Classifier`: Predicted by a random forest (RF) model (ToDo: specify which model)
+  - `Metal Semiconductor Classifier`: Classifying whether a metal could be a semiconductor. Predicted with CGCNN (ToDo: Add Ref!)
+  - `Poisson Ratio`:  ToDo: Description + Reference
+  - `Shear Moduli` ...
+  - `Bulk Moduli`
+  - `Fermi Energy`
+  - `Band Gap`
+  - `Absolute Energy`
+  - `Formation Energy`
+### Input file for crystal model
+The file with information about the metal. Depending on the property you want to predict, the required file format differs:
+- `Metal NonMetal Classifier`: Requires a single `.csv` file with the metal (chemical formula) in the first column and the crystal system in the second.
+- **All others**: Predicted with CGCNN. The input can be either a single `.cif` file (to predict a single metal) or a `.zip` archive containing multiple `.cif` files (for batch prediction).
+# Model card - CGCNN
 **Model Details**: The [Regression Transformer](https://arxiv.org/abs/2202.01338) is a multitask Transformer that reformulates regression as a conditional sequence modeling task. This yields a dichotomous language model that seamlessly integrates property prediction with property-driven conditional generation.
 Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
+# Model card - RandomForestMetalClassifier
+ToDo...
+# Citation
 ```bib
+@article{manica2022gt4sd,
+  title={GT4SD: Generative Toolkit for Scientific Discovery},
+  author={Manica, Matteo and Cadow, Joris and Christofidellis, Dimitrios and Dave, Ashish and Born, Jannis and Clarke, Dean and Teukam, Yves Gaetan Nana and Hoffman, Samuel C and Buchan, Matthew and Chenthamarakshan, Vijil and others},
+  journal={arXiv preprint arXiv:2207.03928},
   year={2022}
 }
+```

model_cards/metal.csv CHANGED Viewed

@@ -1,4 +1,11 @@
-KPSO2,orthorhombic
 Zr2Ga(PO4)3,trigonal
 Te4Mo(WSe)2,trigonal
 Mo3W(SeS3)2,trigonal
@@ -13,7 +20,6 @@ Te6Mo3WS2,trigonal
 KMg6CO8,tetragonal
 Mg14BiBO16,orthorhombic
 KMg14WO16,tetragonal
-Mg14AlCdO16,orthorhombic
 Mg30VCrO32,tetragonal
 Mg30CoSiO32,tetragonal
 YMg30CO32,tetragonal
@@ -25,9 +31,7 @@ CaMg30NiO32,tetragonal
 LiMg30AlO32,tetragonal
 Mg30AlFeO32,tetragonal
 RbMg30SbO32,tetragonal
-KNaMg30O3orthorhombic
 La7Sm(Fe2O5)4,triclinic
-SrCa3Mn4O1triclinic
 NbNi3(HC)2,tetragonal
 La2P2AuO,monoclinic
 Li9Mn2Co5O16,monoclinic
@@ -40,11 +44,4 @@ LiCr4P7O24,triclinic
 ZnGe(OF)6,trigonal
 Cs2Mo(SO)2,monoclinic
 NaMgSO7,monoclinic
-K2NaNdCl6,cubic
-K2NaBiCl6,cubic
-Na2EuCuCl6,cubic
-NaLi2CoF6,cubic
-K2NaTiF6,cubic
-K2AgRhF6,cubic
-K2CeAgCl6,cubic
-K2ErCuCl6,cubic

+K2NaNdCl6,cubic
+K2NaBiCl6,cubic
+Na2EuCuCl6,cubic
+NaLi2CoF6,cubic
+K2NaTiF6,cubic
+K2AgRhF6,cubic
+K2CeAgCl6,cubic
+K2ErCuCl6,cubic
 Zr2Ga(PO4)3,trigonal
 Te4Mo(WSe)2,trigonal
 Mo3W(SeS3)2,trigonal
 KMg6CO8,tetragonal
 Mg14BiBO16,orthorhombic
 KMg14WO16,tetragonal
 Mg30VCrO32,tetragonal
 Mg30CoSiO32,tetragonal
 YMg30CO32,tetragonal
 LiMg30AlO32,tetragonal
 Mg30AlFeO32,tetragonal
 RbMg30SbO32,tetragonal
 La7Sm(Fe2O5)4,triclinic
 NbNi3(HC)2,tetragonal
 La2P2AuO,monoclinic
 Li9Mn2Co5O16,monoclinic
 ZnGe(OF)6,trigonal
 Cs2Mo(SO)2,monoclinic
 NaMgSO7,monoclinic
+Mg14AlCdO16,orthorhombic