Update README.md
Browse files
    	
        README.md
    CHANGED
    
    | @@ -1,199 +1,109 @@ | |
| 1 | 
             
            ---
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2 | 
             
            library_name: transformers
         | 
| 3 | 
            -
            tags: | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 4 | 
             
            ---
         | 
| 5 |  | 
| 6 | 
            -
            #  | 
| 7 | 
            -
             | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 16 | 
            -
             | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
            - ** | 
| 21 | 
            -
            -  | 
| 22 | 
            -
            -  | 
| 23 | 
            -
            - ** | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
            - ** | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
             | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 82 | 
            -
             | 
| 83 | 
            -
             | 
| 84 | 
            -
             | 
| 85 | 
            -
             | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 93 | 
            -
             | 
| 94 | 
            -
             | 
| 95 | 
            -
             | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
            <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
         | 
| 100 | 
            -
             | 
| 101 | 
            -
            [More Information Needed]
         | 
| 102 | 
            -
             | 
| 103 | 
            -
            ## Evaluation
         | 
| 104 | 
            -
             | 
| 105 | 
            -
            <!-- This section describes the evaluation protocols and provides the results. -->
         | 
| 106 | 
            -
             | 
| 107 | 
            -
            ### Testing Data, Factors & Metrics
         | 
| 108 | 
            -
             | 
| 109 | 
            -
            #### Testing Data
         | 
| 110 | 
            -
             | 
| 111 | 
            -
            <!-- This should link to a Dataset Card if possible. -->
         | 
| 112 | 
            -
             | 
| 113 | 
            -
            [More Information Needed]
         | 
| 114 | 
            -
             | 
| 115 | 
            -
            #### Factors
         | 
| 116 | 
            -
             | 
| 117 | 
            -
            <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
         | 
| 118 | 
            -
             | 
| 119 | 
            -
            [More Information Needed]
         | 
| 120 | 
            -
             | 
| 121 | 
            -
            #### Metrics
         | 
| 122 | 
            -
             | 
| 123 | 
            -
            <!-- These are the evaluation metrics being used, ideally with a description of why. -->
         | 
| 124 | 
            -
             | 
| 125 | 
            -
            [More Information Needed]
         | 
| 126 | 
            -
             | 
| 127 | 
            -
            ### Results
         | 
| 128 | 
            -
             | 
| 129 | 
            -
            [More Information Needed]
         | 
| 130 | 
            -
             | 
| 131 | 
            -
            #### Summary
         | 
| 132 | 
            -
             | 
| 133 | 
            -
             | 
| 134 | 
            -
             | 
| 135 | 
            -
            ## Model Examination [optional]
         | 
| 136 | 
            -
             | 
| 137 | 
            -
            <!-- Relevant interpretability work for the model goes here -->
         | 
| 138 | 
            -
             | 
| 139 | 
            -
            [More Information Needed]
         | 
| 140 | 
            -
             | 
| 141 | 
            -
            ## Environmental Impact
         | 
| 142 | 
            -
             | 
| 143 | 
            -
            <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
         | 
| 144 | 
            -
             | 
| 145 | 
            -
            Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
         | 
| 146 | 
            -
             | 
| 147 | 
            -
            - **Hardware Type:** [More Information Needed]
         | 
| 148 | 
            -
            - **Hours used:** [More Information Needed]
         | 
| 149 | 
            -
            - **Cloud Provider:** [More Information Needed]
         | 
| 150 | 
            -
            - **Compute Region:** [More Information Needed]
         | 
| 151 | 
            -
            - **Carbon Emitted:** [More Information Needed]
         | 
| 152 | 
            -
             | 
| 153 | 
            -
            ## Technical Specifications [optional]
         | 
| 154 | 
            -
             | 
| 155 | 
            -
            ### Model Architecture and Objective
         | 
| 156 | 
            -
             | 
| 157 | 
            -
            [More Information Needed]
         | 
| 158 | 
            -
             | 
| 159 | 
            -
            ### Compute Infrastructure
         | 
| 160 | 
            -
             | 
| 161 | 
            -
            [More Information Needed]
         | 
| 162 | 
            -
             | 
| 163 | 
            -
            #### Hardware
         | 
| 164 | 
            -
             | 
| 165 | 
            -
            [More Information Needed]
         | 
| 166 | 
            -
             | 
| 167 | 
            -
            #### Software
         | 
| 168 | 
            -
             | 
| 169 | 
            -
            [More Information Needed]
         | 
| 170 | 
            -
             | 
| 171 | 
            -
            ## Citation [optional]
         | 
| 172 | 
            -
             | 
| 173 | 
            -
            <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
         | 
| 174 | 
            -
             | 
| 175 | 
            -
            **BibTeX:**
         | 
| 176 | 
            -
             | 
| 177 | 
            -
            [More Information Needed]
         | 
| 178 | 
            -
             | 
| 179 | 
            -
            **APA:**
         | 
| 180 | 
            -
             | 
| 181 | 
            -
            [More Information Needed]
         | 
| 182 | 
            -
             | 
| 183 | 
            -
            ## Glossary [optional]
         | 
| 184 | 
            -
             | 
| 185 | 
            -
            <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
         | 
| 186 | 
            -
             | 
| 187 | 
            -
            [More Information Needed]
         | 
| 188 | 
            -
             | 
| 189 | 
            -
            ## More Information [optional]
         | 
| 190 | 
            -
             | 
| 191 | 
            -
            [More Information Needed]
         | 
| 192 | 
            -
             | 
| 193 | 
            -
            ## Model Card Authors [optional]
         | 
| 194 | 
            -
             | 
| 195 | 
            -
            [More Information Needed]
         | 
| 196 | 
            -
             | 
| 197 | 
            -
            ## Model Card Contact
         | 
| 198 | 
            -
             | 
| 199 | 
            -
            [More Information Needed]
         | 
|  | |
| 1 | 
             
            ---
         | 
| 2 | 
            +
            license: mit
         | 
| 3 | 
            +
            language:
         | 
| 4 | 
            +
            - en
         | 
| 5 | 
            +
            base_model:
         | 
| 6 | 
            +
            - mistralai/Mistral-7B-v0.1
         | 
| 7 | 
            +
            - google/gemma-7b
         | 
| 8 | 
             
            library_name: transformers
         | 
| 9 | 
            +
            tags:
         | 
| 10 | 
            +
            - mergekit
         | 
| 11 | 
            +
            - merged-model
         | 
| 12 | 
            +
            - mistral
         | 
| 13 | 
            +
            - gemma
         | 
| 14 | 
            +
            - language-model
         | 
| 15 | 
             
            ---
         | 
| 16 |  | 
| 17 | 
            +
            # π MistralGemma-Hybrid-7B: A Fusion of Power & Precision
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            ## π Overview
         | 
| 20 | 
            +
            **MistralGemma-Hybrid-7B** is an **experimental hybrid language model** that blends the strengths of **Mistral-7B** and **Gemma-7B** using the **Spherical Linear Interpolation (slerp) merging technique**. Designed to optimize both efficiency and performance, this model offers robust text generation capabilities while leveraging the advantages of both parent models.
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            π **Created by**: Matteo Khan  
         | 
| 23 | 
            +
            π **Affiliation**: Apprentice at TW3 Partners (Generative AI Research)  
         | 
| 24 | 
            +
            π **License**: MIT  
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            π [Connect with me on LinkedIn](https://www.linkedin.com/in/matteo-khan-a10309263/)  
         | 
| 27 | 
            +
            π [Model on Hugging Face](https://huggingface.co/YourProfile/MistralGemma-Hybrid-7B)  
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            ## 🧠 Model Details
         | 
| 30 | 
            +
            - **Model Type**: Hybrid Language Model (Merged)
         | 
| 31 | 
            +
            - **Parent Models**:
         | 
| 32 | 
            +
              - [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
         | 
| 33 | 
            +
              - [Gemma-7B](https://huggingface.co/google/gemma-7b)
         | 
| 34 | 
            +
            - **Merging Technique**: Slerp Merge (MergeKit)
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            ## 🎯 Intended Use
         | 
| 37 | 
            +
            This model is intended for **research and experimentation** in hybrid model optimization. Potential applications include:
         | 
| 38 | 
            +
            - ✅ Text Generation
         | 
| 39 | 
            +
            - ✅ Conversational AI
         | 
| 40 | 
            +
            - ✅ Creative Writing Assistance
         | 
| 41 | 
            +
            - ✅ Exploration of Model Merging Effects
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            ## ⚠️ Limitations & Considerations
         | 
| 44 | 
            +
            While **MistralGemma-Hybrid-7B** offers enhanced capabilities, it also inherits limitations from its parent models:
         | 
| 45 | 
            +
            - ❌ May generate **inaccurate or misleading** information
         | 
| 46 | 
            +
            - ⚠️ Potential for **biased, offensive, or harmful** content
         | 
| 47 | 
            +
            - π Merging may introduce **unpredictable behaviors**
         | 
| 48 | 
            +
            - π Performance may **vary across different tasks**
         | 
| 49 | 
            +
             | 
| 50 | 
            +
            ## 🔬 Merging Process & Configuration
         | 
| 51 | 
            +
            This is **not a newly trained model**, but rather a merge of existing models using the following configuration:
         | 
| 52 | 
            +
             | 
| 53 | 
            +
            ```yaml
         | 
| 54 | 
            +
            merge_method: slerp  # Using slerp instead of linear
         | 
| 55 | 
            +
            dtype: float16
         | 
| 56 | 
            +
            models:
         | 
| 57 | 
            +
              - model: "mistralai/Mistral-7B-v0.1"
         | 
| 58 | 
            +
                parameters:
         | 
| 59 | 
            +
                  weight: 0.5
         | 
| 60 | 
            +
              - model: "google/gemma-7b"
         | 
| 61 | 
            +
                parameters:
         | 
| 62 | 
            +
                  weight: 0.5
         | 
| 63 | 
            +
             | 
| 64 | 
            +
            parameters:
         | 
| 65 | 
            +
              normalize: true
         | 
| 66 | 
            +
              int8_mask: false
         | 
| 67 | 
            +
              rescale: true  # Helps with different model scales
         | 
| 68 | 
            +
             | 
| 69 | 
            +
            layers:
         | 
| 70 | 
            +
              - pattern: ".*"
         | 
| 71 | 
            +
                layer_range: [0, -1]
         | 
| 72 | 
            +
            ```
         | 
| 73 | 
            +
             | 
| 74 | 
            +
            π **No formal evaluation** has been conducted yet. Users are encouraged to **benchmark and share feedback**!
         | 
| 75 | 
            +
             | 
| 76 | 
            +
            ## π Environmental Impact
         | 
| 77 | 
            +
            By utilizing **model merging** rather than training from scratch, **MistralGemma-Hybrid-7B** significantly reduces computational and environmental costs.
         | 
| 78 | 
            +
             | 
| 79 | 
            +
            ## π How to Use
         | 
| 80 | 
            +
            ```python
         | 
| 81 | 
            +
            from transformers import AutoModelForCausalLM, AutoTokenizer
         | 
| 82 | 
            +
             | 
| 83 | 
            +
            model_name = "YourProfile/MistralGemma-Hybrid-7B"
         | 
| 84 | 
            +
            tokenizer = AutoTokenizer.from_pretrained(model_name)
         | 
| 85 | 
            +
            model = AutoModelForCausalLM.from_pretrained(model_name)
         | 
| 86 | 
            +
             | 
| 87 | 
            +
            # Example usage
         | 
| 88 | 
            +
            prompt = "Write a short story about the future of AI."
         | 
| 89 | 
            +
            inputs = tokenizer(prompt, return_tensors="pt")
         | 
| 90 | 
            +
            outputs = model.generate(**inputs, max_length=200)
         | 
| 91 | 
            +
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         | 
| 92 | 
            +
            print(response)
         | 
| 93 | 
            +
            ```
         | 
| 94 | 
            +
             | 
| 95 | 
            +
            **π Citation**
         | 
| 96 | 
            +
            ```bibtex
         | 
| 97 | 
            +
            @misc{mistralgemma2025,
         | 
| 98 | 
            +
                  title={MistralGemma: A Hybrid Open-Source Language Model},
         | 
| 99 | 
            +
                  author={Matteo Khan},
         | 
| 100 | 
            +
                  year={2025},
         | 
| 101 | 
            +
                  eprint={arXiv:XXXX.XXXXX},
         | 
| 102 | 
            +
                  archivePrefix={arXiv},
         | 
| 103 | 
            +
                  primaryClass={cs.CL}
         | 
| 104 | 
            +
            }
         | 
| 105 | 
            +
            ```
         | 
| 106 | 
            +
             | 
| 107 | 
            +
            📩 **Feedback & Contact**: Reach out via [Hugging Face](https://huggingface.co/YourProfile).
         | 
| 108 | 
            +
             | 
| 109 | 
            +
            π **Happy Experimenting!** π
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  |