from enum import Enum
from functools import lru_cache, partial
import json
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional, Tuple

import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import huggingface_hub
from huggingface_hub import (
    HfFileSystem,
    get_repo_discussions,
    hf_hub_download,
    list_repo_commits,
    model_info,
    upload_folder,
)
from huggingface_hub.errors import (
    EntryNotFoundError,
    HFValidationError,
    RepositoryNotFoundError,
)
from optimum.intel import OVQuantizationConfig
from sentence_transformers import CrossEncoder, SentenceTransformer, SparseEncoder
from sentence_transformers import (
    export_dynamic_quantized_onnx_model as st_export_dynamic_quantized_onnx_model,
    export_optimized_onnx_model as st_export_optimized_onnx_model,
    export_static_quantized_openvino_model as st_export_static_quantized_openvino_model,
)


class Backend(Enum):
    # TORCH = "PyTorch"
    ONNX = "ONNX"
    ONNX_DYNAMIC_QUANTIZATION = "ONNX (Dynamic Quantization)"
    ONNX_OPTIMIZATION = "ONNX (Optimization)"
    OPENVINO = "OpenVINO"
    OPENVINO_STATIC_QUANTIZATION = "OpenVINO (Static Quantization)"

    def __str__(self):
        return self.value


class Archetype(Enum):
    SENTENCE_TRANSFORMER = "SentenceTransformer"
    SPARSE_ENCODER = "SparseEncoder"
    CROSS_ENCODER = "CrossEncoder"
    OTHER = "Other"

    def __str__(self):
        return self.value
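

# str(Backend.X) and str(Archetype.X) return the display value, so the snippet
# builders below can interpolate {archetype} directly as a class name
# (e.g. "SentenceTransformer") inside generated code.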
backends = [str(backend) for backend in Backend]

FILE_SYSTEM = HfFileSystem()


def is_new_model(model_id: str) -> bool:
    """
    Check if the model ID exists on the Hugging Face Hub. If we get a request error,
    then we assume the model *does* exist.
    """
    try:
        model_info(model_id)
    except RepositoryNotFoundError:
        return True
    except Exception:
        pass
    return False


def is_sentence_transformer_model(model_id: str) -> bool:
    return "sentence-transformers" in model_info(model_id).tags


def get_archetype(model_id: str) -> Archetype:
    """
    Determine whether the repository holds a SentenceTransformer, SparseEncoder,
    or CrossEncoder model, based on its configuration files.
    """
    if "/" not in model_id:
        return Archetype.OTHER
    try:
        config_sentence_transformers_path = hf_hub_download(
            model_id, filename="config_sentence_transformers.json"
        )
    except (RepositoryNotFoundError, HFValidationError):
        return Archetype.OTHER
    except EntryNotFoundError:
        config_sentence_transformers_path = None

    try:
        config_path = hf_hub_download(model_id, filename="config.json")
    except (RepositoryNotFoundError, HFValidationError):
        return Archetype.OTHER
    except EntryNotFoundError:
        config_path = None

    if config_sentence_transformers_path is None and config_path is None:
        return Archetype.OTHER

    if config_sentence_transformers_path is not None:
        with open(config_sentence_transformers_path, "r", encoding="utf8") as f:
            st_config = json.load(f)
        model_type = st_config.get("model_type", "SentenceTransformer")
        if model_type == "SentenceTransformer":
            return Archetype.SENTENCE_TRANSFORMER
        elif model_type == "SparseEncoder":
            return Archetype.SPARSE_ENCODER
        else:
            return Archetype.OTHER

    if config_path is not None:
        with open(config_path, "r", encoding="utf8") as f:
            config = json.load(f)
        # Guard against config.json files without an "architectures" key
        architectures = config.get("architectures") or [""]
        if "sentence_transformers" in config or architectures[0].endswith(
            "ForSequenceClassification"
        ):
            return Archetype.CROSS_ENCODER
    return Archetype.OTHER
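

# Archetype resolution is driven purely by repository metadata: a "model_type"
# in config_sentence_transformers.json wins, and otherwise a config.json with a
# "sentence_transformers" key or a *ForSequenceClassification architecture is
# treated as a CrossEncoder.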


def get_last_commit(model_id: str) -> str:
    """
    Get the URL of the most recent commit on the model repository.
    """
    return f"https://huggingface.co/{model_id}/commit/{list_repo_commits(model_id)[0].commit_id}"


def get_last_pr(model_id: str) -> Tuple[str, int]:
    """
    Get the URL and number of the most recent discussion, i.e. the PR just opened.
    """
    last_pr = next(get_repo_discussions(model_id))
    return last_pr.url, last_pr.num


def does_file_glob_exist(repo_id: str, glob: str) -> bool:
    """
    Check if a file glob exists in the repository.
    """
    try:
        return bool(FILE_SYSTEM.glob(f"{repo_id}/{glob}", detail=False))
    except FileNotFoundError:
        return False
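

# Example (hypothetical repo ID, for illustration only):
#   does_file_glob_exist("my-user/my-model", "**/model.onnx")
# returns True when the repository already contains an ONNX export anywhere in
# its file tree.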


def export_to_torch(model_id, create_pr, output_model_id):
    # Note: currently unused; the corresponding Backend.TORCH option is commented out above.
    model = SentenceTransformer(model_id, backend="torch")
    model.push_to_hub(
        repo_id=output_model_id,
        create_pr=create_pr,
        exist_ok=True,
    )


def export_to_onnx(
    model_id: str,
    archetype: Archetype,
    create_pr: bool,
    output_model_id: str,
    token: Optional[str] = None,
) -> None:
    if does_file_glob_exist(output_model_id, "**/model.onnx"):
        raise FileExistsError("An ONNX model already exists in the repository")

    if archetype == Archetype.SENTENCE_TRANSFORMER:
        model = SentenceTransformer(model_id, backend="onnx")
    elif archetype == Archetype.SPARSE_ENCODER:
        model = SparseEncoder(model_id, backend="onnx")
    elif archetype == Archetype.CROSS_ENCODER:
        model = CrossEncoder(model_id, backend="onnx")
    else:
        return

    commit_message = "Add exported onnx model 'model.onnx'"
    if is_new_model(output_model_id):
        model.push_to_hub(
            repo_id=output_model_id,
            commit_message=commit_message,
            create_pr=create_pr,
            token=token,
        )
    else:
        with TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir)

            commit_description = f"""
Hello!

*This pull request has been automatically generated from the [Sentence Transformers backend-export](https://huggingface.co/spaces/sentence-transformers/backend-export) Space.*

## Pull Request overview
* Add exported ONNX model `model.onnx`.

## Tip:
Consider testing this pull request before merging by loading the model from this PR with the `revision` argument:
```python
from sentence_transformers import {archetype}

# TODO: Fill in the PR number
pr_number = 2
model = {archetype}(
    "{output_model_id}",
    revision=f"refs/pr/{{pr_number}}",
    backend="onnx",
)

# Verify that everything works as expected
{'''embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
print(embeddings.shape)
similarities = model.similarity(embeddings, embeddings)
print(similarities)''' if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER} else
'''predictions = model.predict([
    ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
    ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
])
print(predictions)'''}
```
"""
            upload_folder(
                repo_id=output_model_id,
                folder_path=Path(tmp_dir) / "onnx",
                path_in_repo="onnx",
                commit_message=commit_message,
                commit_description=commit_description if create_pr else None,
                create_pr=create_pr,
                token=token,
            )


def export_to_onnx_snippet(
    model_id: str, archetype: Archetype, create_pr: bool, output_model_id: str
) -> Tuple[str, str, str]:
    if archetype == Archetype.OTHER:
        return "", "", ""

    return (
        """\
pip install sentence_transformers[onnx-gpu]
# or
pip install sentence_transformers[onnx]
""",
        f"""\
from sentence_transformers import {archetype}

# 1. Load the model to be exported with the ONNX backend
model = {archetype}(
    "{model_id}",
    backend="onnx",
)

# 2. Push the model to the Hugging Face Hub
{f'model.push_to_hub("{output_model_id}")'
    if not create_pr
    else f'''model.push_to_hub(
    "{output_model_id}",
    create_pr=True,
)'''}
""",
        f"""\
from sentence_transformers import {archetype}

# 1. Load the model from the Hugging Face Hub
# (until merged) Use the `revision` argument to load the model from the PR
pr_number = 2
model = {archetype}(
    "{output_model_id}",
    revision=f"refs/pr/{{pr_number}}",
    backend="onnx",
)
"""
        + (
            """
# 2. Inference works as normal
embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
similarities = model.similarity(embeddings, embeddings)
"""
            if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER}
            else """
# 2. Inference works as normal
predictions = model.predict([
    ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
    ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
])
"""
        ),
    )
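

# Each *_snippet helper returns a (requirements, export snippet, inference
# snippet) triple; on_change below routes these into the three corresponding
# gr.Code components.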


def export_to_onnx_dynamic_quantization(
    model_id: str,
    archetype: Archetype,
    create_pr: bool,
    output_model_id: str,
    onnx_quantization_config: str,
    token: Optional[str] = None,
) -> None:
    if does_file_glob_exist(
        output_model_id, f"onnx/model_qint8_{onnx_quantization_config}.onnx"
    ):
        raise FileExistsError(
            "The quantized ONNX model already exists in the repository"
        )

    if archetype == Archetype.SENTENCE_TRANSFORMER:
        model = SentenceTransformer(model_id, backend="onnx")
    elif archetype == Archetype.SPARSE_ENCODER:
        model = SparseEncoder(model_id, backend="onnx")
    elif archetype == Archetype.CROSS_ENCODER:
        model = CrossEncoder(model_id, backend="onnx")
    else:
        return

    if not create_pr and is_new_model(output_model_id):
        model.push_to_hub(repo_id=output_model_id, token=token)

    # Monkey-patch the upload_folder function to include the token, as it's not
    # passed through by export_dynamic_quantized_onnx_model
    original_upload_folder = huggingface_hub.upload_folder
    huggingface_hub.upload_folder = partial(original_upload_folder, token=token)
    try:
        st_export_dynamic_quantized_onnx_model(
            model,
            quantization_config=onnx_quantization_config,
            model_name_or_path=output_model_id,
            push_to_hub=True,
            create_pr=create_pr,
        )
    except ValueError:
        # Currently, quantization with optimum has some issues if there's already
        # an ONNX model in a subfolder; re-export the model from scratch as a workaround
        if archetype == Archetype.SENTENCE_TRANSFORMER:
            model = SentenceTransformer(
                model_id, backend="onnx", model_kwargs={"export": True}
            )
        elif archetype == Archetype.SPARSE_ENCODER:
            model = SparseEncoder(
                model_id, backend="onnx", model_kwargs={"export": True}
            )
        elif archetype == Archetype.CROSS_ENCODER:
            model = CrossEncoder(
                model_id, backend="onnx", model_kwargs={"export": True}
            )
        else:
            return
        st_export_dynamic_quantized_onnx_model(
            model,
            quantization_config=onnx_quantization_config,
            model_name_or_path=output_model_id,
            push_to_hub=True,
            create_pr=create_pr,
        )
    finally:
        huggingface_hub.upload_folder = original_upload_folder
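

# Usage sketch (hypothetical IDs, for illustration only; the Space calls this
# from on_submit with values taken from the UI):
#   export_to_onnx_dynamic_quantization(
#       "my-user/my-model",
#       Archetype.SENTENCE_TRANSFORMER,
#       create_pr=True,
#       output_model_id="my-user/my-model",
#       onnx_quantization_config="avx512_vnni",
#   )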


def export_to_onnx_dynamic_quantization_snippet(
    model_id: str,
    archetype: Archetype,
    create_pr: bool,
    output_model_id: str,
    onnx_quantization_config: str,
) -> Tuple[str, str, str]:
    if archetype == Archetype.OTHER:
        return "", "", ""

    return (
        """\
pip install sentence_transformers[onnx-gpu]
# or
pip install sentence_transformers[onnx]
""",
        f"""\
from sentence_transformers import (
    {archetype},
    export_dynamic_quantized_onnx_model,
)

# 1. Load the model to be exported with the ONNX backend
model = {archetype}(
    "{model_id}",
    backend="onnx",
)

# 2. Export the model with {onnx_quantization_config} dynamic quantization
export_dynamic_quantized_onnx_model(
    model,
    quantization_config="{onnx_quantization_config}",
    model_name_or_path="{output_model_id}",
    push_to_hub=True,
{'''    create_pr=True,
''' if create_pr else ''})
""",
        f"""\
from sentence_transformers import {archetype}

# 1. Load the model from the Hugging Face Hub
# (until merged) Use the `revision` argument to load the model from the PR
pr_number = 2
model = {archetype}(
    "{output_model_id}",
    revision=f"refs/pr/{{pr_number}}",
    backend="onnx",
    model_kwargs={{"file_name": "model_qint8_{onnx_quantization_config}.onnx"}},
)
"""
        + (
            """
# 2. Inference works as normal
embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
similarities = model.similarity(embeddings, embeddings)
"""
            if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER}
            else """
# 2. Inference works as normal
predictions = model.predict([
    ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
    ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
])
"""
        ),
    )


def export_to_onnx_optimization(
    model_id: str,
    archetype: Archetype,
    create_pr: bool,
    output_model_id: str,
    onnx_optimization_config: str,
    token: Optional[str] = None,
) -> None:
    if does_file_glob_exist(
        output_model_id, f"onnx/model_{onnx_optimization_config}.onnx"
    ):
        raise FileExistsError(
            "The optimized ONNX model already exists in the repository"
        )

    if archetype == Archetype.SENTENCE_TRANSFORMER:
        model = SentenceTransformer(model_id, backend="onnx")
    elif archetype == Archetype.SPARSE_ENCODER:
        model = SparseEncoder(model_id, backend="onnx")
    elif archetype == Archetype.CROSS_ENCODER:
        model = CrossEncoder(model_id, backend="onnx")
    else:
        return

    if not create_pr and is_new_model(output_model_id):
        model.push_to_hub(repo_id=output_model_id, token=token)

    # Monkey-patch the upload_folder function to include the token, as it's not
    # passed through by export_optimized_onnx_model
    original_upload_folder = huggingface_hub.upload_folder
    huggingface_hub.upload_folder = partial(original_upload_folder, token=token)
    try:
        st_export_optimized_onnx_model(
            model,
            optimization_config=onnx_optimization_config,
            model_name_or_path=output_model_id,
            push_to_hub=True,
            create_pr=create_pr,
        )
    finally:
        huggingface_hub.upload_folder = original_upload_folder


def export_to_onnx_optimization_snippet(
    model_id: str,
    archetype: Archetype,
    create_pr: bool,
    output_model_id: str,
    onnx_optimization_config: str,
) -> Tuple[str, str, str]:
    if archetype == Archetype.OTHER:
        return "", "", ""

    return (
        """\
pip install sentence_transformers[onnx-gpu]
# or
pip install sentence_transformers[onnx]
""",
        f"""\
from sentence_transformers import (
    {archetype},
    export_optimized_onnx_model,
)

# 1. Load the model to be optimized with the ONNX backend
model = {archetype}(
    "{model_id}",
    backend="onnx",
)

# 2. Export the model with {onnx_optimization_config} optimization level
export_optimized_onnx_model(
    model,
    optimization_config="{onnx_optimization_config}",
    model_name_or_path="{output_model_id}",
    push_to_hub=True,
{'''    create_pr=True,
''' if create_pr else ''})
""",
        f"""\
from sentence_transformers import {archetype}

# 1. Load the model from the Hugging Face Hub
# (until merged) Use the `revision` argument to load the model from the PR
pr_number = 2
model = {archetype}(
    "{output_model_id}",
    revision=f"refs/pr/{{pr_number}}",
    backend="onnx",
    model_kwargs={{"file_name": "model_{onnx_optimization_config}.onnx"}},
)
"""
        + (
            """
# 2. Inference works as normal
embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
similarities = model.similarity(embeddings, embeddings)
"""
            if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER}
            else """
# 2. Inference works as normal
predictions = model.predict([
    ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
    ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
])
"""
        ),
    )


def export_to_openvino(
    model_id: str,
    archetype: Archetype,
    create_pr: bool,
    output_model_id: str,
    token: Optional[str] = None,
) -> None:
    if does_file_glob_exist(output_model_id, "**/openvino_model.xml"):
        raise FileExistsError("The OpenVINO model already exists in the repository")

    if archetype == Archetype.SENTENCE_TRANSFORMER:
        model = SentenceTransformer(model_id, backend="openvino")
    elif archetype == Archetype.SPARSE_ENCODER:
        model = SparseEncoder(model_id, backend="openvino")
    elif archetype == Archetype.CROSS_ENCODER:
        model = CrossEncoder(model_id, backend="openvino")
    else:
        return

    commit_message = "Add exported openvino model 'openvino_model.xml'"
    if is_new_model(output_model_id):
        model.push_to_hub(
            repo_id=output_model_id,
            commit_message=commit_message,
            create_pr=create_pr,
            token=token,
        )
    else:
        with TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir)

            commit_description = f"""
Hello!

*This pull request has been automatically generated from the [Sentence Transformers backend-export](https://huggingface.co/spaces/sentence-transformers/backend-export) Space.*

## Pull Request overview
* Add exported OpenVINO model `openvino_model.xml`.

## Tip:
Consider testing this pull request before merging by loading the model from this PR with the `revision` argument:
```python
from sentence_transformers import {archetype}

# TODO: Fill in the PR number
pr_number = 2
model = {archetype}(
    "{output_model_id}",
    revision=f"refs/pr/{{pr_number}}",
    backend="openvino",
)

# Verify that everything works as expected
{'''embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
print(embeddings.shape)
similarities = model.similarity(embeddings, embeddings)
print(similarities)''' if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER} else
'''predictions = model.predict([
    ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
    ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
])
print(predictions)'''}
```
"""
            upload_folder(
                repo_id=output_model_id,
                folder_path=Path(tmp_dir) / "openvino",
                path_in_repo="openvino",
                commit_message=commit_message,
                commit_description=commit_description if create_pr else None,
                create_pr=create_pr,
                token=token,
            )


def export_to_openvino_snippet(
    model_id: str, archetype: Archetype, create_pr: bool, output_model_id: str
) -> Tuple[str, str, str]:
    if archetype == Archetype.OTHER:
        return "", "", ""

    return (
        """\
pip install sentence_transformers[openvino]
""",
        f"""\
from sentence_transformers import {archetype}

# 1. Load the model to be exported with the OpenVINO backend
model = {archetype}(
    "{model_id}",
    backend="openvino",
)

# 2. Push the model to the Hugging Face Hub
{f'model.push_to_hub("{output_model_id}")'
    if not create_pr
    else f'''model.push_to_hub(
    "{output_model_id}",
    create_pr=True,
)'''}
""",
        f"""\
from sentence_transformers import {archetype}

# 1. Load the model from the Hugging Face Hub
# (until merged) Use the `revision` argument to load the model from the PR
pr_number = 2
model = {archetype}(
    "{output_model_id}",
    revision=f"refs/pr/{{pr_number}}",
    backend="openvino",
)
"""
        + (
            """
# 2. Inference works as normal
embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
similarities = model.similarity(embeddings, embeddings)
"""
            if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER}
            else """
# 2. Inference works as normal
predictions = model.predict([
    ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
    ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
])
"""
        ),
    )


def export_to_openvino_static_quantization(
    model_id: str,
    archetype: Archetype,
    create_pr: bool,
    output_model_id: str,
    ov_quant_dataset_name: str,
    ov_quant_dataset_subset: str,
    ov_quant_dataset_split: str,
    ov_quant_dataset_column_name: str,
    ov_quant_dataset_num_samples: int,
    token: Optional[str] = None,
) -> None:
    if does_file_glob_exist(
        output_model_id, "openvino/openvino_model_qint8_quantized.xml"
    ):
        raise FileExistsError(
            "The quantized OpenVINO model already exists in the repository"
        )

    if archetype == Archetype.SENTENCE_TRANSFORMER:
        model = SentenceTransformer(model_id, backend="openvino")
    elif archetype == Archetype.SPARSE_ENCODER:
        model = SparseEncoder(model_id, backend="openvino")
    elif archetype == Archetype.CROSS_ENCODER:
        model = CrossEncoder(model_id, backend="openvino")
    else:
        return

    if not create_pr and is_new_model(output_model_id):
        model.push_to_hub(repo_id=output_model_id, token=token)

    # Monkey-patch the upload_folder function to include the token, as it's not
    # passed through by export_static_quantized_openvino_model
    original_upload_folder = huggingface_hub.upload_folder
    huggingface_hub.upload_folder = partial(original_upload_folder, token=token)
    try:
        st_export_static_quantized_openvino_model(
            model,
            quantization_config=OVQuantizationConfig(
                num_samples=ov_quant_dataset_num_samples,
            ),
            model_name_or_path=output_model_id,
            dataset_name=ov_quant_dataset_name,
            dataset_config_name=ov_quant_dataset_subset,
            dataset_split=ov_quant_dataset_split,
            column_name=ov_quant_dataset_column_name,
            push_to_hub=True,
            create_pr=create_pr,
        )
    finally:
        huggingface_hub.upload_folder = original_upload_folder
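

# Note: OVQuantizationConfig defaults to 8-bit quantization, and only the number
# of calibration samples is overridden above, which is why the exported file is
# expected at openvino/openvino_model_qint8_quantized.xml (see the glob check).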


def export_to_openvino_static_quantization_snippet(
    model_id: str,
    archetype: Archetype,
    create_pr: bool,
    output_model_id: str,
    ov_quant_dataset_name: str,
    ov_quant_dataset_subset: str,
    ov_quant_dataset_split: str,
    ov_quant_dataset_column_name: str,
    ov_quant_dataset_num_samples: int,
) -> Tuple[str, str, str]:
    if archetype == Archetype.OTHER:
        return "", "", ""

    return (
        """\
pip install sentence_transformers[openvino]
""",
        f"""\
from sentence_transformers import (
    {archetype},
    export_static_quantized_openvino_model,
)
from optimum.intel import OVQuantizationConfig

# 1. Load the model to be quantized with the OpenVINO backend
model = {archetype}(
    "{model_id}",
    backend="openvino",
)

# 2. Export the model with int8 static quantization
export_static_quantized_openvino_model(
    model,
    quantization_config=OVQuantizationConfig(
        num_samples={ov_quant_dataset_num_samples},
    ),
    model_name_or_path="{output_model_id}",
    dataset_name="{ov_quant_dataset_name}",
    dataset_config_name="{ov_quant_dataset_subset}",
    dataset_split="{ov_quant_dataset_split}",
    column_name="{ov_quant_dataset_column_name}",
    push_to_hub=True,
{'''    create_pr=True,
''' if create_pr else ''})
""",
        f"""\
from sentence_transformers import {archetype}

# 1. Load the model from the Hugging Face Hub
# (until merged) Use the `revision` argument to load the model from the PR
pr_number = 2
model = {archetype}(
    "{output_model_id}",
    revision=f"refs/pr/{{pr_number}}",
    backend="openvino",
    model_kwargs={{"file_name": "openvino_model_qint8_quantized.xml"}},
)
"""
        + (
            """
# 2. Inference works as normal
embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
similarities = model.similarity(embeddings, embeddings)
"""
            if archetype in {Archetype.SENTENCE_TRANSFORMER, Archetype.SPARSE_ENCODER}
            else """
# 2. Inference works as normal
predictions = model.predict([
    ["Which planet is known as the Red Planet?", "Mars, known for its reddish appearance, is often referred to as the Red Planet."],
    ["Which planet is known as the Red Planet?", "Jupiter, the largest planet in our solar system, has a prominent red spot."],
])
"""
        ),
    )


def on_submit(
    model_id,
    create_pr,
    output_model_id,
    backend,
    onnx_quantization_config,
    onnx_optimization_config,
    ov_quant_dataset_name,
    ov_quant_dataset_subset,
    ov_quant_dataset_split,
    ov_quant_dataset_column_name,
    ov_quant_dataset_num_samples,
    inference_snippet: str,
    oauth_token: Optional[gr.OAuthToken] = None,
    profile: Optional[gr.OAuthProfile] = None,
):
    if oauth_token is None or profile is None:
        return (
            "Commit or PR url:<br>...",
            inference_snippet,
            gr.Textbox(
                "Please sign in with Hugging Face to use this Space", visible=True
            ),
        )
    if not model_id:
        return (
            "Commit or PR url:<br>...",
            inference_snippet,
            gr.Textbox("Please enter a model ID", visible=True),
        )
    if not is_sentence_transformer_model(model_id):
        return (
            "Commit or PR url:<br>...",
            inference_snippet,
            gr.Textbox(
                "The source model must have a Sentence Transformers tag", visible=True
            ),
        )

    if output_model_id and "/" not in output_model_id:
        output_model_id = f"{profile.username}/{output_model_id}"
    output_model_id = output_model_id if not create_pr else model_id
    archetype = get_archetype(model_id)

    try:
        if backend == Backend.ONNX.value:
            export_to_onnx(
                model_id, archetype, create_pr, output_model_id, token=oauth_token.token
            )
        elif backend == Backend.ONNX_DYNAMIC_QUANTIZATION.value:
            export_to_onnx_dynamic_quantization(
                model_id,
                archetype,
                create_pr,
                output_model_id,
                onnx_quantization_config,
                token=oauth_token.token,
            )
        elif backend == Backend.ONNX_OPTIMIZATION.value:
            export_to_onnx_optimization(
                model_id,
                archetype,
                create_pr,
                output_model_id,
                onnx_optimization_config,
                token=oauth_token.token,
            )
        elif backend == Backend.OPENVINO.value:
            export_to_openvino(
                model_id, archetype, create_pr, output_model_id, token=oauth_token.token
            )
        elif backend == Backend.OPENVINO_STATIC_QUANTIZATION.value:
            export_to_openvino_static_quantization(
                model_id,
                archetype,
                create_pr,
                output_model_id,
                ov_quant_dataset_name,
                ov_quant_dataset_subset,
                ov_quant_dataset_split,
                ov_quant_dataset_column_name,
                ov_quant_dataset_num_samples,
                token=oauth_token.token,
            )
    except FileExistsError as exc:
        return (
            "Commit or PR url:<br>...",
            inference_snippet,
            gr.Textbox(str(exc), visible=True),
        )

    if create_pr:
        url, num = get_last_pr(output_model_id)
        return (
            f"PR url:<br>{url}",
            inference_snippet.replace("pr_number = 2", f"pr_number = {num}"),
            gr.Textbox(visible=False),
        )

    # Remove the lines of the inference snippet that refer to the `revision`
    # argument: the revision kwarg (index 7), `pr_number = 2` (index 4), and the
    # "(until merged)" comment (index 3). Deleting from the back keeps the
    # earlier indices valid.
    lines = inference_snippet.splitlines()
    del lines[7]
    del lines[4]
    del lines[3]
    inference_snippet = "\n".join(lines)
    return (
        f"Commit url:<br>{get_last_commit(output_model_id)}",
        inference_snippet,
        gr.Textbox(visible=False),
    )


def on_change(
    model_id,
    create_pr,
    output_model_id,
    backend,
    onnx_quantization_config,
    onnx_optimization_config,
    ov_quant_dataset_name,
    ov_quant_dataset_subset,
    ov_quant_dataset_split,
    ov_quant_dataset_column_name,
    ov_quant_dataset_num_samples,
    oauth_token: Optional[gr.OAuthToken] = None,
    profile: Optional[gr.OAuthProfile] = None,
) -> Tuple[str, str, str, gr.Textbox]:
    if oauth_token is None or profile is None:
        return (
            "",
            "",
            "",
            gr.Textbox(
                "Please sign in with Hugging Face to use this Space", visible=True
            ),
        )
    if not model_id:
        return "", "", "", gr.Textbox("Please enter a model ID", visible=True)

    if output_model_id and "/" not in output_model_id:
        output_model_id = f"{profile.username}/{output_model_id}"
    output_model_id = output_model_id if not create_pr else model_id
    archetype = get_archetype(model_id)

    if backend == Backend.ONNX.value:
        snippets = export_to_onnx_snippet(
            model_id, archetype, create_pr, output_model_id
        )
    elif backend == Backend.ONNX_DYNAMIC_QUANTIZATION.value:
        snippets = export_to_onnx_dynamic_quantization_snippet(
            model_id, archetype, create_pr, output_model_id, onnx_quantization_config
        )
    elif backend == Backend.ONNX_OPTIMIZATION.value:
        snippets = export_to_onnx_optimization_snippet(
            model_id, archetype, create_pr, output_model_id, onnx_optimization_config
        )
    elif backend == Backend.OPENVINO.value:
        snippets = export_to_openvino_snippet(
            model_id, archetype, create_pr, output_model_id
        )
    elif backend == Backend.OPENVINO_STATIC_QUANTIZATION.value:
        snippets = export_to_openvino_static_quantization_snippet(
            model_id,
            archetype,
            create_pr,
            output_model_id,
            ov_quant_dataset_name,
            ov_quant_dataset_subset,
            ov_quant_dataset_split,
            ov_quant_dataset_column_name,
            ov_quant_dataset_num_samples,
        )
    else:
        return "", "", "", gr.Textbox("Unexpected backend!", visible=True)

    return *snippets, gr.Textbox(visible=False)
| css = """ | |
| .container { | |
| padding-left: 0; | |
| } | |
| div:has(> div.text-error) { | |
| border-color: var(--error-border-color); | |
| } | |
| .small-text * { | |
| font-size: var(--block-info-text-size); | |
| } | |
| """ | |

with gr.Blocks(
    css=css,
    theme=gr.themes.Base(),
) as demo:
    gr.LoginButton(min_width=250)

    with gr.Row():
        # Left Input Column
        with gr.Column(scale=2):
            gr.Markdown(
                value="""\
### Export a SentenceTransformer, SparseEncoder, or CrossEncoder model to accelerated backends

Sentence Transformers models can be optimized for **faster inference** on CPU and GPU devices by exporting, quantizing, and optimizing them in ONNX and OpenVINO formats.

See the Speeding up Inference documentation for more information:
* [SentenceTransformer > Speeding up Inference](https://sbert.net/docs/sentence_transformer/usage/efficiency.html)
* [SparseEncoder > Speeding up Inference](https://sbert.net/docs/sparse_encoder/usage/efficiency.html)
* [CrossEncoder > Speeding up Inference](https://sbert.net/docs/cross_encoder/usage/efficiency.html)
""",
                label="",
                container=True,
            )
            gr.HTML(
                value="""\
<details><summary>Click to see performance benchmarks</summary>
<table>
    <thead>
        <tr>
            <th>SentenceTransformer GPU</th>
            <th>SentenceTransformer CPU</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>
                <img src="https://sbert.net/_images/backends_benchmark_gpu.png" alt="">
            </td>
            <td>
                <img src="https://sbert.net/_images/backends_benchmark_cpu.png" alt="">
            </td>
        </tr>
    </tbody>
</table>
<table>
    <thead>
        <tr>
            <th>SparseEncoder GPU</th>
            <th>SparseEncoder CPU</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>
                <img src="https://sbert.net/_images/se_backends_benchmark_gpu.png" alt="">
            </td>
            <td>
                <img src="https://sbert.net/_images/se_backends_benchmark_cpu.png" alt="">
            </td>
        </tr>
    </tbody>
</table>
<table>
    <thead>
        <tr>
            <th>CrossEncoder GPU</th>
            <th>CrossEncoder CPU</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>
                <img src="https://sbert.net/_images/ce_backends_benchmark_gpu.png" alt="">
            </td>
            <td>
                <img src="https://sbert.net/_images/ce_backends_benchmark_cpu.png" alt="">
            </td>
        </tr>
    </tbody>
</table>
<ul>
    <li><code>onnx</code> refers to the ONNX backend</li>
    <li><code>onnx-qint8</code> refers to ONNX (Dynamic Quantization)</li>
    <li><code>onnx-O1</code> to <code>onnx-O4</code> refer to ONNX (Optimization)</li>
    <li><code>openvino</code> refers to the OpenVINO backend</li>
    <li><code>openvino-qint8</code> refers to OpenVINO (Static Quantization)</li>
</ul>
</details>
"""
            )
            model_id = HuggingfaceHubSearch(
                label="SentenceTransformer, SparseEncoder, or CrossEncoder model to export",
                placeholder="Search for SentenceTransformer, SparseEncoder, or CrossEncoder models on Hugging Face",
                search_type="model",
            )
            create_pr = gr.Checkbox(
                value=True,
                label="Create PR",
                info="Create a pull request instead of pushing directly to a repository",
            )
            output_model_id = gr.Textbox(
                value="",
                label="Model repository to write to",
                placeholder="Model ID",
                type="text",
                visible=False,
            )
            create_pr.change(
                lambda create_pr: gr.Textbox(visible=not create_pr),
                inputs=[create_pr],
                outputs=[output_model_id],
            )

            backend = gr.Radio(
                choices=backends,
                value=str(Backend.ONNX),
                label="Backend",
            )
            with gr.Group(visible=True) as onnx_group:
                gr.Markdown(
                    value="[ONNX Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#onnx)",
                    container=True,
                    elem_classes=["small-text"],
                )
            with gr.Group(visible=False) as onnx_dynamic_quantization_group:
                onnx_quantization_config = gr.Radio(
                    choices=["arm64", "avx2", "avx512", "avx512_vnni"],
                    value="avx512_vnni",
                    label="Quantization config",
                    info="[ONNX Quantization Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#quantizing-onnx-models)",
                )
            with gr.Group(visible=False) as onnx_optimization_group:
                onnx_optimization_config = gr.Radio(
                    choices=["O1", "O2", "O3", "O4"],
                    value="O4",
                    label="Optimization config",
                    info="[ONNX Optimization Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#optimizing-onnx-models)",
                )
            with gr.Group(visible=False) as openvino_group:
                gr.Markdown(
                    value="[OpenVINO Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#openvino)",
                    container=True,
                    elem_classes=["small-text"],
                )
            with gr.Group(visible=False) as openvino_static_quantization_group:
                gr.Markdown(
                    value="[OpenVINO Quantization Documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#quantizing-openvino-models)",
                    container=True,
                    elem_classes=["small-text"],
                )
                ov_quant_dataset_name = HuggingfaceHubSearch(
                    value="nyu-mll/glue",
                    label="Calibration Dataset Name",
                    placeholder="Search for Sentence Transformer datasets on Hugging Face",
                    search_type="dataset",
                )
                ov_quant_dataset_subset = gr.Textbox(
                    value="sst2",
                    label="Calibration Dataset Subset",
                    placeholder="Calibration Dataset Subset",
                    type="text",
                    max_lines=1,
                )
                ov_quant_dataset_split = gr.Textbox(
                    value="train",
                    label="Calibration Dataset Split",
                    placeholder="Calibration Dataset Split",
                    type="text",
                    max_lines=1,
                )
                ov_quant_dataset_column_name = gr.Textbox(
                    value="sentence",
                    label="Calibration Dataset Column Name",
                    placeholder="Calibration Dataset Column Name",
                    type="text",
                    max_lines=1,
                )
                ov_quant_dataset_num_samples = gr.Number(
                    value=300,
                    label="Calibration Dataset Num Samples",
                )

            # Show only the config group that matches the selected backend
            backend.change(
                lambda backend: (
                    gr.Group(visible=backend == Backend.ONNX.value),
                    gr.Group(visible=backend == Backend.ONNX_DYNAMIC_QUANTIZATION.value),
                    gr.Group(visible=backend == Backend.ONNX_OPTIMIZATION.value),
                    gr.Group(visible=backend == Backend.OPENVINO.value),
                    gr.Group(visible=backend == Backend.OPENVINO_STATIC_QUANTIZATION.value),
                ),
                inputs=[backend],
                outputs=[
                    onnx_group,
                    onnx_dynamic_quantization_group,
                    onnx_optimization_group,
                    openvino_group,
                    openvino_static_quantization_group,
                ],
            )

            submit_button = gr.Button(
                "Export Model",
                variant="primary",
            )

        # Right Output Column
        with gr.Column(scale=1):
            error = gr.Textbox(
                value="",
                label="Error",
                type="text",
                visible=False,
                max_lines=1,
                interactive=False,
                elem_classes=["text-error"],
            )
            requirements = gr.Code(
                value="",
                language="shell",
                label="Requirements",
                lines=1,
            )
            export_snippet = gr.Code(
                value="",
                language="python",
                label="Export Snippet",
            )
            inference_snippet = gr.Code(
                value="",
                language="python",
                label="Inference Snippet",
            )
            url = gr.Markdown(
                value="Commit or PR url:<br>...",
                label="",
                container=True,
                visible=True,
            )

    submit_button.click(
        on_submit,
        inputs=[
            model_id,
            create_pr,
            output_model_id,
            backend,
            onnx_quantization_config,
            onnx_optimization_config,
            ov_quant_dataset_name,
            ov_quant_dataset_subset,
            ov_quant_dataset_split,
            ov_quant_dataset_column_name,
            ov_quant_dataset_num_samples,
            inference_snippet,
        ],
        outputs=[url, inference_snippet, error],
    )

    for input_component in [
        model_id,
        create_pr,
        output_model_id,
        backend,
        onnx_quantization_config,
        onnx_optimization_config,
        ov_quant_dataset_name,
        ov_quant_dataset_subset,
        ov_quant_dataset_split,
        ov_quant_dataset_column_name,
        ov_quant_dataset_num_samples,
    ]:
        input_component.change(
            on_change,
            inputs=[
                model_id,
                create_pr,
                output_model_id,
                backend,
                onnx_quantization_config,
                onnx_optimization_config,
                ov_quant_dataset_name,
                ov_quant_dataset_subset,
                ov_quant_dataset_split,
                ov_quant_dataset_column_name,
                ov_quant_dataset_num_samples,
            ],
            outputs=[requirements, export_snippet, inference_snippet, error],
        )

if __name__ == "__main__":
    demo.launch(ssr_mode=False)