"""Gradio Space for bulk-embedding a Hugging Face dataset.

The user picks a Hub dataset, a sentence-embedding model, and an ONNX
optimization level; `run` loads everything and pushes the embedded dataset
back to the Hub under a new dataset id.
"""

import subprocess

# Install the git-lfs hooks needed to push large files to the Hub.
# Argument-list form with the default shell=False avoids shell-injection
# pitfalls; check=False keeps the non-raising behavior of the original call.
subprocess.run(["git", "lfs", "install"], check=False)

import gradio as gr

from utils import load_hf_dataset, get_model_and_tokenizer, batch_embed

# TODO: add instructor models
# "hkunlp/instructor-xl",
# "hkunlp/instructor-large",
# "hkunlp/instructor-base",

# (model repo id, hidden size) pairs shown in the model dropdown.
models_and_hidden_sizes = [
    ("BAAI/bge-small-en-v1.5", 384),
    ("intfloat/e5-small-v2", 384),
    ("intfloat/e5-base-v2", 768),
    ("intfloat/e5-large-v2", 1024),
    ("intfloat/multilingual-e5-small", 384),
    ("intfloat/multilingual-e5-base", 768),
    ("intfloat/multilingual-e5-large", 1024),
    ("sentence-transformers/all-MiniLM-L6-v2", 384),
    ("sentence-transformers/all-MiniLM-L12-v2", 384),
    ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 384),
]

model_options = [
    f"{model_name} (hidden_size = {hidden_size})"
    for model_name, hidden_size in models_and_hidden_sizes
]

# ONNX optimization level -> human-readable description (and the inverse map,
# used to recover the level from the dropdown selection).
opt2desc = {
    "O2": "Most precise, slowest (O2: basic and extended general optimizations, transformers-specific fusions)",
    "O3": "Less precise, faster (O2 + gelu approx)",
    "O4": "Least precise, fastest (O2 + gelu approx + fp16/bf16)",
}

desc2opt = {v: k for k, v in opt2desc.items()}

optimization_options = list(opt2desc.values())


def run(
    ds_name,
    ds_config,
    column_name,
    ds_split,
    model_choice,
    opt_desc,
    new_dataset_id,
    # progress=gr.Progress(),
):
    """Load the dataset and model, embed `column_name`, and push the result.

    Args:
        ds_name: Hub dataset id to load.
        ds_config: dataset config name ("" selects the default config).
        column_name: name of the text column to embed.
        ds_split: split to embed ("train" / "validation" / "test").
        model_choice: dropdown string "repo_id (hidden_size = N)"; only the
            leading repo id is used.
        opt_desc: human-readable optimization description; mapped back to an
            ONNX optimization level via `desc2opt`.
        new_dataset_id: "username/name" to push the embedded dataset to.

    Returns:
        The string "done" (displayed in the status textbox).
    """
    print("Loading dataset")
    ds = load_hf_dataset(ds_name, ds_config, ds_split)
    print("dataset loaded")

    opt_level = desc2opt[opt_desc]
    # The dropdown value is "repo_id (hidden_size = N)"; the repo id is the
    # first whitespace-separated token.
    model_name = model_choice.split()[0]

    print("Loading model/tokenizer")
    model, tokenizer = get_model_and_tokenizer(model_name, opt_level)
    print("Model optimized and loaded")

    batch_embed(
        ds,
        model,
        tokenizer,
        model_name=model_name,
        column_name=column_name,
        new_dataset_id=new_dataset_id,
        opt_level=opt_level,
        # progress=progress,
    )

    return "done"


with gr.Blocks(title="Bulk embeddings") as demo:
    gr.Markdown(
        """
    This Space allows you to embed a large dataset easily.

    ## Steps

    1. Upload the dataset to the Hugging Face Hub.
    2. Choose one of the models. These are taken from the top of the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
    """
        "If you have short documents, O3 will be faster than O4. "
        "If you have long documents, O4 will be faster than O3."
    )

    ds_name = gr.Textbox(
        lines=1,
        label="Enter dataset to load from Hugging Face Hub",
        value="nbroad/basic_text_dataset",
    )

    ds_config = gr.Textbox(
        lines=1, label="Enter dataset config or leave blank to use default", value=""
    )

    column_name = gr.Textbox(lines=1, label="Enter column to embed", value="text")

    ds_split = gr.Dropdown(
        choices=["train", "validation", "test"],
        label="Enter split to embed",
        value="train",
    )

    model_choice = gr.Dropdown(
        choices=model_options, label="Enter model to load", value=model_options[0]
    )

    opt_desc = gr.Dropdown(
        choices=optimization_options,
        label="Enter optimization level",
        value=optimization_options[0],
    )

    new_dataset_id = gr.Textbox(
        lines=1,
        label="Enter new dataset name, including username",
        value="nbroad/test-embeds",
    )

    btn = gr.Button(value="Embed texts!")
    # Status textbox updated with run()'s return value.
    last = gr.Textbox(value="")

    btn.click(
        fn=run,
        inputs=[
            ds_name,
            ds_config,
            column_name,
            ds_split,
            model_choice,
            opt_desc,
            new_dataset_id,
        ],
        outputs=last,
    )


if __name__ == "__main__":
    demo.launch(show_error=True, debug=True)