# Merge configuration: one base model, two entries under `experts`, and one
# entry under `shared_experts`.
# NOTE(review): the key layout looks like a mergekit-moe config; the nesting
# below was reconstructed from flattened text - confirm against the consuming
# tool's schema.
base_model: qwen_chat
gate_mode: hidden
dtype: bfloat16

experts:
  # Math-oriented expert.
  - source_model: qwen_math
    positive_prompts:
      - "Solve the equation"
      - "Derive the formula"
      - "Given the value x, solve for y"
      - "Find a function that models this"
      - "Find the integral of the function"
      - "Find the first order derivative"
      - "What is the answer to this math question"
  # Code-oriented expert.
  - source_model: qwen_code
    positive_prompts:
      - "Write a python program"
      - "Write a java program"
      - "Write a C++ program"
      - "Create a quicksort program"
      - "Implement a library that does"
      - "How can I do this in Python"
      - "How can I do this in Java"
      - "How can I do this in C++"
      - "How can I do this in Javascript"
      - "Create a website with HTML"

shared_experts:
  # General chat expert; same source model as `base_model`.
  - source_model: qwen_chat
    positive_prompts:
      - "Hello, who are you?"
      - "I need help with"
      - "Can you explain"
      - "Help me with this"
    # Downweight output from shared expert to prevent overcooking the model.
    # NOTE(review): assumed to be a per-expert setting, so it is nested inside
    # this entry rather than at top level - confirm.
    residual_scale: 0.1