import os
from collections import defaultdict

import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import (
    add_collection_item,
    create_collection,
    list_datasets,
)
from toolz import unique
from tqdm.auto import tqdm

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
LIMIT = None  # set to an int to limit how many datasets are fetched (useful for testing)
NAMESPACE = "DIBT"  # Hub namespace under which the language collections are created


def extract_languages(dataset_info):
    """Return the language codes declared in a dataset's `language:` tags."""
    return [tag.split(":")[1] for tag in dataset_info.tags if tag.startswith("language:")]


def create_dataset_info():
    """Find DPO datasets on the Hub and group them by language."""
    all_datasets = list(tqdm(list_datasets(full=True, limit=LIMIT)))
    all_datasets = [d for d in all_datasets if "open-llm-leaderboard" not in d.id]
    # A dataset counts as a DPO dataset if its id contains "_dpo"/"dpo_" or it carries a `dpo` tag.
    dpo_in_name = [
        dataset
        for dataset in all_datasets
        if "_dpo" in dataset.id or "dpo_" in dataset.id
    ]
    dpo_in_tags = [
        dataset for dataset in all_datasets if any(tag == "dpo" for tag in dataset.tags)
    ]
    all_dpo_datasets = dpo_in_name + dpo_in_tags
    dpo_datasets = list(unique(all_dpo_datasets, key=lambda x: x.id))
    # Only keep datasets whose card declares at least one language.
    dpo_datasets = [d for d in dpo_datasets if d.card_data is not None]
    dpo_datasets_with_languages = [
        dpo_dataset
        for dpo_dataset in dpo_datasets
        if dpo_dataset.card_data.get("language") is not None
    ]
    language_groups = defaultdict(list)
    for dataset in dpo_datasets_with_languages:
        languages = extract_languages(dataset)
        for language in languages:
            language_groups[language].append(dataset)
    return language_groups


def create_update_collections(language_groups):
    """Create (or update) one Hub collection per language and add any missing datasets."""
    collections = {}
    for language, dataset_list in language_groups.items():
        collection_title = f"DPO datasets for {language.upper()}"
        collection = create_collection(
            title=collection_title,
            description=f"A collection of DPO datasets for the {language.upper()} language.",
            exists_ok=True,
            namespace=NAMESPACE,
            token=HF_TOKEN,
        )
        existing_items = {item.item_id for item in collection.items}
        for dataset in dataset_list:
            if dataset.id not in existing_items:
                add_collection_item(
                    collection.slug, item_id=dataset.id, item_type="dataset", token=HF_TOKEN
                )
        collections[language] = collection
    return collections


def display_datasets(language):
    """Render a Markdown listing of the datasets available for the selected language."""
    if language not in datasets:
        return "No datasets found for the selected language."
    dataset_list = datasets[language]
    collection = collections[language]
    output = f"## Datasets for {language.upper()}\n\n"
    output += f"Total datasets: {len(dataset_list)}\n\n"
    output += f"View the Hugging Face [Collection](https://huggingface.co/collections/{collection.slug}) for this language.\n\n"
    for dataset in dataset_list:
        output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
    return output


def display_overview():
    """Render a Markdown overview of how many datasets and languages are covered."""
    total_datasets = sum(len(dataset_list) for dataset_list in datasets.values())
    total_languages = len(datasets)
    overview = "## Dataset Overview\n\n"
    overview += f"- Total number of datasets: {total_datasets}\n"
    overview += f"- Total number of languages covered: {total_languages}\n\n"
    overview += "### Datasets per Language\n\n"
    for language, dataset_list in datasets.items():
        collection = collections[language]
        overview += (
            f"- {language.upper()}: {len(dataset_list)} datasets "
            f"([View Collection](https://huggingface.co/collections/{collection.slug}))\n"
        )
    return overview


# Create the dataset information
datasets = create_dataset_info()

# Create/update collections for each language
collections = create_update_collections(datasets)

# Get the list of available languages
languages = list(datasets.keys())

overview = """
This Space shows an overview of preference datasets, in particular DPO-style datasets, available on the Hugging Face Hub across different languages.

Recently, [Odds Ratio Preference Optimization](https://huggingface.co/papers/2403.07691) (ORPO) has been demonstrated to be a powerful tool for training better-performing language models directly from preference datasets.

- ORPO can be done using DPO-style datasets.
- Is having enough DPO datasets for different languages a key ingredient for training better models for every language?
- This Space aims to track the number of DPO datasets available on the Hugging Face Hub for different languages.
"""

dpo = """
#### What is Direct Preference Optimization (DPO)?

DPO is a machine learning approach designed to optimize language models based on direct user preferences, bypassing the traditional reward modeling phase. It works by:

1. Calculating log probabilities of preferred and less preferred outputs from a language model.
2. Adjusting the model to maximize the likelihood of preferred outputs.

This makes the optimization process simpler and potentially more effective by directly targeting what users deem desirable or preferable in language model responses.

A DPO dataset includes three components:

- **Input**: The input text or prompt that the language model receives.
- **Chosen Output**: The output text that the user prefers.
- **Rejected Output**: The output text that is less preferred by the user.

#### What is ORPO?

Odds Ratio Preference Optimization (ORPO) is a refinement that does not require a reference model for preference alignment. ORPO directly trains a language model without an SFT step, meaning you can do SFT and preference training in one stage.

ORPO uses the same datasets as DPO, but the training process is different. This means any DPO dataset can be used for ORPO training!

Recently, Argilla, KAIST, and Hugging Face created [zephyr-orpo-141b-A35b-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1) using ORPO. This model shows very strong performance whilst using only 70k examples for training. This suggests that ORPO could be a very powerful tool for training better-performing open language models for many languages.
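
Because ORPO consumes the same preference format as DPO, a single row in one of these datasets typically looks something like the sketch below (column names vary across Hub datasets; `prompt`/`chosen`/`rejected` is a common convention, and this example is purely illustrative):

```python
example_row = {
    "prompt": "Explain why the sky is blue.",
    "chosen": "The sky appears blue because air molecules scatter shorter (blue) wavelengths of sunlight more strongly than longer ones.",
    "rejected": "The sky is blue because it reflects the ocean.",
}
```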

The only thing that might be missing is enough DPO datasets for training these models. This Space aims to track which DPO datasets are available for different languages and how many datasets exist for each language.
"""

adding_datasets = """
## Adding a dataset

To include a dataset in this Space, it should either have a `dpo` tag or a dataset ID containing `_dpo` or `dpo_`. Additionally, the dataset card should include language metadata. If you know of another dataset that should be included in this Space, please:

1. Add the `dpo` tag to the dataset
2. Include the language metadata in the dataset card
3. Open a discussion in this Space. I'll refresh the list of datasets to include it 🤗
"""

faq = """
## Frequently Asked Questions

**Q: What is the difference between DPO and ORPO?**

A: DPO and ORPO both use direct user preferences to optimize language models, but ORPO does not require a separate reference model for preference alignment. ORPO can perform supervised fine-tuning (SFT) and preference training in a single stage.

**Q: Can I use DPO datasets for ORPO training?**

A: Yes! Since ORPO uses the same dataset format as DPO, any DPO dataset can be used for ORPO training.

**Q: How can I contribute to this Space?**

A: If you know of a dataset that should be included, make sure it has the `dpo` tag or the appropriate dataset ID format, and include language metadata in the dataset card. Then, open a discussion in this Space to let me know about the dataset.
"""
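
# Sketch (not executed by this app): dataset authors can add the `dpo` tag and
# language metadata to their dataset card programmatically with
# `huggingface_hub.metadata_update`; the repo id and values below are hypothetical.
#
# from huggingface_hub import metadata_update
#
# metadata_update(
#     "your-username/your-dpo-dataset",  # hypothetical dataset repo id
#     {"language": ["en"], "tags": ["dpo"]},
#     repo_type="dataset",
# )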

with gr.Blocks() as demo:
    gr.HTML(
        "<h1 style='text-align: center;'>🌍 DPO Datasets by Language 🗣️</h1>"
    )
    gr.Markdown(overview)
    overview_display = gr.Markdown(display_overview())
    with gr.Row():
        with gr.Column():
            language_dropdown = gr.Dropdown(languages, label="Select Language")
            dataset_info = gr.Markdown()
            language_dropdown.change(
                display_datasets, inputs=language_dropdown, outputs=dataset_info
            )
    with gr.Accordion("More Information", open=False):
        gr.Markdown(dpo)
        gr.Markdown(adding_datasets)
        gr.Markdown(faq)

demo.launch()