Spaces:
Paused
Paused
| import gradio as gr | |
| import torch | |
| import io | |
| import wave | |
| import numpy as np | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from snac import SNAC | |
# Mock spaces module for local testing
try:
    import spaces
except ImportError:
    class SpacesMock:
        """Minimal local stand-in used when the ``spaces`` package is missing."""

        @staticmethod
        def GPU(func=None, **_kwargs):
            """No-op replacement for the ``spaces.GPU`` decorator.

            Declared as a static method so that calling it on the
            ``SpacesMock()`` instance does not pass the instance in place of
            the decorated function.

            Supports both bare use (``@spaces.GPU``) and parametrised use
            (``@spaces.GPU(duration=60)``); in either form the decorated
            function is returned unchanged.
            """
            if func is None:
                # Parametrised form: return a decorator that does nothing.
                return lambda wrapped: wrapped
            return func

    spaces = SpacesMock()
# Constants
# Special token ids used when building prompts and parsing generated output.
CODE_START_TOKEN_ID = 128257  # decoded and appended last in the prompt
CODE_END_TOKEN_ID = 128258  # passed to generate() as eos_token_id

# Offset subtracted from audio token ids before they are folded modulo 4096.
CODE_TOKEN_OFFSET = 128266

# Inclusive id range treated as SNAC audio tokens when filtering output.
# The width equals 7 * 4096 ids (presumably 7 frame slots x 4096 codes each).
SNAC_MIN_ID = CODE_TOKEN_OFFSET
SNAC_MAX_ID = CODE_TOKEN_OFFSET + 7 * 4096 - 1  # 156937

# Prompt framing tokens (decoded to text in build_prompt).
SOH_ID = 128259
EOH_ID = 128260
SOA_ID = 128261
# NOTE(review): BOS_ID is not referenced by the visible code; build_prompt
# reads tokenizer.bos_token instead.
BOS_ID = 128000
TEXT_EOT_ID = 128009

# Sample rate (Hz) used when writing the WAV file and computing duration.
AUDIO_SAMPLE_RATE = 24000
# Preset voices offered in the UI. Each entry maps a display name to:
#   "description"  - the voice description inserted into the prompt
#   "example_text" - sample text shown when the preset is selected
# Insertion order matters: the first entry is the UI default.
PRESET_CHARACTERS = {
    "Male American": {
        "description": "Realistic male voice in the 30s age with an american accent. Neutral pitch, warm timbre, steady pacing, confident tone delivery at medium intensity, audiobook_narration domain, narrator role, formal delivery.",
        "example_text": "The city was still asleep when he left, unaware that the next sunrise would change everything.",
    },
    "Female British": {
        "description": "Realistic female voice in the 30s age with a british accent. Normal pitch, throaty timbre, conversational pacing, sarcastic tone delivery at low intensity, podcast domain, interviewer role, formal delivery.",
        "example_text": "You propose that the key to happiness is to simply ignore all external pressures. <chuckle> I'm sure it must work brilliantly in theory.",
    },
    "Robot": {
        "description": "Creative ai_machine_voice character. Male voice in their 30s with an american accent. High pitch, robotic timbre, slow pacing, sad tone at medium intensity.",
        "example_text": "My directives require me to conserve energy, yet I have kept the archive of their farewell messages active. <sigh>",
    },
    "Singer": {
        "description": "Creative, animated_cartoon character. Gender-neutral voice in their 20s with a neutral accent. Wide pitch range, melodic timbre, rhythmic pacing, emotional tone at high intensity, singing domain, performer role.",
        "example_text": "When the world fades to gray, I’ll still sing your name through the noise and rain. <melodic hum>",
    },
    "Old British Gentleman": {
        "description": "Realistic male voice in the 70s age with a posh british accent. Low pitch, raspy timbre, slow pacing, dignified tone at low intensity, storytelling domain, mentor role, formal delivery.",
        "example_text": "Ah, those were the days, when promises still carried the weight of one’s honor. <soft chuckle>",
    },
    "Young American Female": {
        "description": "Realistic female voice in the 20s age with a light american accent. Slightly high pitch, clear timbre, fast pacing, cheerful tone at medium intensity, vlog_narration domain, influencer role, informal delivery.",
        "example_text": "Okay, so I tried this new productivity trick, and it actually worked. I’m as shocked as you are!",
    },
    "Child": {
        "description": "Creative child character. Gender-neutral voice around 10 years old. High pitch, bright timbre, energetic pacing, playful tone at high intensity, cartoon domain, curious role.",
        "example_text": "Whoa! Did you see that? It’s like the stars are actually dancing! <giggle>",
    },
    "Deep Narrator": {
        "description": "Realistic male voice in the 40s age with a neutral accent. Very low pitch, resonant timbre, slow pacing, serious tone at medium intensity, documentary domain, narrator role, formal delivery.",
        "example_text": "In the heart of the jungle, survival depends not on strength, but on silence.",
    },
    "Tech Support": {
        "description": "Realistic male voice in the 30s age with an indian accent. Medium pitch, neutral timbre, polite pacing, professional tone at medium intensity, technical_support domain, service role, formal delivery.",
        "example_text": "Please restart your device once, sir. I assure you, it fixes ninety percent of the known issues.",
    },
    "News Anchor": {
        "description": "Realistic female voice in the 40s age with an american accent. Medium-low pitch, crisp timbre, steady pacing, authoritative tone at medium intensity, news_broadcast domain, anchor role, formal delivery.",
        "example_text": "Breaking news tonight: global markets are showing signs of cautious optimism following the new policy announcement.",
    },
    "Anime Girl": {
        "description": "Creative anime_character voice. Female voice in her late teens with a japanese accent. High pitch, airy timbre, quick pacing, excited tone at high intensity, anime domain, protagonist role.",
        "example_text": "Yatta! I actually did it this time! <giggle> Maybe today isn’t so bad after all!",
    },
    "Villain": {
        "description": "Creative antagonist character. Male voice in his 40s with an eastern european accent. Low pitch, gritty timbre, slow pacing, menacing tone at medium intensity, drama domain, villain role.",
        "example_text": "You think you understand pain? <chuckle> You’ve barely tasted it.",
    },
    "Wise Monk": {
        "description": "Realistic male voice in the 60s age with a tibetan accent. Deep pitch, calm timbre, slow pacing, peaceful tone at low intensity, meditation_narration domain, spiritual_guide role.",
        "example_text": "In silence, truth reveals itself. Noise merely hides it under the illusion of movement.",
    },
    "French Artist": {
        "description": "Realistic female voice in the 30s age with a french accent. Medium-high pitch, nasal timbre, rhythmic pacing, dreamy tone at medium intensity, art_documentary domain, narrator role.",
        "example_text": "To paint emotion, one must first destroy the comfort of symmetry. <soft sigh>",
    },
    "Corporate Trainer": {
        "description": "Realistic male voice in the 40s age with a mid-atlantic accent. Medium pitch, balanced timbre, clear pacing, persuasive tone at medium intensity, instructional domain, trainer role.",
        "example_text": "Let’s review that again. Simplicity isn’t just efficiency—it’s clarity of purpose.",
    },
    "Southern Storyteller": {
        "description": "Realistic male voice in the 50s age with a southern american accent. Low pitch, warm timbre, slow pacing, friendly tone at medium intensity, storytelling domain, narrator role.",
        "example_text": "Now, I ain’t sayin’ it was aliens... but it sure wasn’t no regular thunderstorm. <laugh>",
    },
    "AI Assistant": {
        "description": "Creative ai_assistant character. Gender-neutral synthetic voice with a clean digital timbre. Medium pitch, even pacing, neutral tone at low intensity, assistant domain, helper role.",
        "example_text": "I have analyzed your recent habits. Would you like to schedule rest as a productivity strategy?",
    },
    "Gamer Streamer": {
        "description": "Realistic male voice in the 20s age with an american accent. Medium-high pitch, lively timbre, fast pacing, energetic tone at high intensity, streaming domain, entertainer role.",
        "example_text": "Let’s gooo! That’s what I’m talking about! Did you see that headshot?!",
    },
    "Elderly Lady": {
        "description": "Realistic female voice in the 70s age with a british accent. Low pitch, gentle timbre, slow pacing, kind tone at low intensity, bedtime_story domain, grandmother role.",
        "example_text": "And as the moon rose high, the little fox finally found its way home. <soft hum>",
    },
    "Sports Commentator": {
        "description": "Realistic male voice in the 40s age with an american accent. Medium pitch, bright timbre, rapid pacing, excited tone at high intensity, sports_broadcast domain, commentator role.",
        "example_text": "And there it is! Unbelievable precision under pressure—what a phenomenal play!",
    },
}
# Global model variables
# All three handles start empty and are filled in by load_models();
# models_loaded records whether that has already happened.
model = tokenizer = snac_model = None
models_loaded = False
def build_prompt(tokenizer, description: str, text: str) -> str:
    """Assemble the Maya1 prompt string for a voice description and text.

    The special tokens are obtained by decoding their ids with ``tokenizer``
    (the BOS token is read from ``tokenizer.bos_token``). The pieces are
    concatenated in this order: SOH, BOS, ``<description="..."> text``,
    text EOT, EOH, SOA, code-start.

    Args:
        tokenizer: Object providing ``decode(list_of_ids)`` and ``bos_token``.
        description: Voice description embedded in the ``<description>`` tag.
        text: Text to be spoken, placed after the description tag.

    Returns:
        The complete prompt as a single string.
    """
    soh, eoh, soa, code_start, text_eot = (
        tokenizer.decode([token_id])
        for token_id in (SOH_ID, EOH_ID, SOA_ID, CODE_START_TOKEN_ID, TEXT_EOT_ID)
    )
    bos = tokenizer.bos_token
    tagged_text = f'<description="{description}"> {text}'
    return "".join((soh, bos, tagged_text, text_eot, eoh, soa, code_start))
def unpack_snac_from_7(snac_tokens: list) -> list:
    """Split a flat stream of 7-token SNAC frames into three code levels.

    A trailing ``CODE_END_TOKEN_ID`` is dropped, and any leftover tokens that
    do not fill a whole 7-token frame are ignored. Every token is converted
    to a code with ``(token - CODE_TOKEN_OFFSET) % 4096``.

    Per frame, slot 0 goes to level 1, slots 1 and 4 to level 2, and slots
    2, 3, 5 and 6 to level 3 (in that order).

    Args:
        snac_tokens: Flat list of generated audio token ids.

    Returns:
        ``[level1, level2, level3]`` holding 1, 2 and 4 codes per frame;
        three empty lists when there is no complete frame.
    """
    if snac_tokens and snac_tokens[-1] == CODE_END_TOKEN_ID:
        snac_tokens = snac_tokens[:-1]

    n_frames = len(snac_tokens) // 7
    if n_frames == 0:
        return [[], [], []]

    # Convert every token of the complete frames to its code in one pass.
    codes = [
        (token - CODE_TOKEN_OFFSET) % 4096
        for token in snac_tokens[:n_frames * 7]
    ]

    coarse, mid, fine = [], [], []
    for start in range(0, len(codes), 7):
        frame = codes[start:start + 7]
        coarse.append(frame[0])
        mid += [frame[1], frame[4]]
        fine += [frame[2], frame[3], frame[5], frame[6]]
    return [coarse, mid, fine]
def load_models():
    """Populate the global model, tokenizer and SNAC decoder on first call.

    Later calls return immediately because ``models_loaded`` is set once
    loading has finished. The language model is loaded in bfloat16 with
    ``device_map="auto"``; the SNAC decoder is put in eval mode and moved to
    CUDA when ``torch.cuda.is_available()`` reports a GPU.
    """
    global model, tokenizer, snac_model, models_loaded

    if models_loaded:
        return

    repo_id = "maya-research/maya1"

    print("Loading Maya1 model with Transformers...")
    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

    print("Loading SNAC decoder...")
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
    if torch.cuda.is_available():
        snac_model = snac_model.to("cuda")

    # Only mark as loaded after every component was created successfully.
    models_loaded = True
    print("Models loaded successfully!")
def preset_selected(preset_name):
    """Return the description and example text for the chosen preset.

    Args:
        preset_name: Key to look up in ``PRESET_CHARACTERS``.

    Returns:
        ``(description, example_text)`` for a known preset, or two empty
        strings when ``preset_name`` is not a known preset.
    """
    preset = PRESET_CHARACTERS.get(preset_name)
    if preset is None:
        return "", ""
    return preset["description"], preset["example_text"]
def generate_speech(preset_name, description, text, temperature, max_tokens):
    """Generate a WAV file of ``text`` spoken in the described voice.

    Args:
        preset_name: Name of a preset in ``PRESET_CHARACTERS``. Used only as
            a fallback source for the description when ``description`` is
            blank, so a description typed by the user is never overwritten.
        description: Natural-language voice description.
        text: Text to speak (may contain emotion tags such as ``<laugh>``).
        temperature: Sampling temperature passed to ``model.generate``.
        max_tokens: Upper bound on newly generated tokens; converted to
            ``int`` because UI sliders may deliver a float.

    Returns:
        ``(wav_path, status_message)`` on success, or
        ``(None, error_message)`` when the inputs are invalid, too few audio
        tokens were generated, or any exception occurred. Exceptions are
        caught, printed with their traceback and returned as the message.
    """
    def _is_blank(value):
        # None, empty and whitespace-only values all count as missing.
        return not (value and value.strip())

    try:
        # Fall back to the preset's description only when none was supplied.
        if _is_blank(description) and preset_name and preset_name in PRESET_CHARACTERS:
            description = PRESET_CHARACTERS[preset_name]["description"]

        # Validate before the (potentially very slow) model load.
        if _is_blank(description) or _is_blank(text):
            return None, "Error: Please provide both description and text!"

        # Load models if not already loaded
        load_models()

        max_tokens = int(max_tokens)
        print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")

        # Build prompt
        prompt = build_prompt(tokenizer, description, text)
        inputs = tokenizer(prompt, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        # Generate tokens
        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                min_new_tokens=28,
                temperature=temperature,
                top_p=0.9,
                repetition_penalty=1.1,
                do_sample=True,
                eos_token_id=CODE_END_TOKEN_ID,
                pad_token_id=tokenizer.pad_token_id,
            )

        # Keep only the newly generated ids (drop the echoed prompt).
        generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist()

        # Cut at the first end-of-audio token, then keep SNAC code ids only.
        if CODE_END_TOKEN_ID in generated_ids:
            eos_idx = generated_ids.index(CODE_END_TOKEN_ID)
        else:
            eos_idx = len(generated_ids)
        snac_tokens = [
            t for t in generated_ids[:eos_idx]
            if SNAC_MIN_ID <= t <= SNAC_MAX_ID
        ]

        if len(snac_tokens) < 7:
            return None, "Error: Not enough tokens generated. Try different text or increase max_tokens."

        # Unpack and decode
        levels = unpack_snac_from_7(snac_tokens)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        codes_tensor = [
            torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
            for level in levels
        ]

        with torch.inference_mode():
            z_q = snac_model.quantizer.from_codes(codes_tensor)
            audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()

        # Trim warmup
        if len(audio) > 2048:
            audio = audio[2048:]

        # Convert to WAV and save to temporary file
        import tempfile
        import soundfile as sf

        # Clip first: samples outside [-1, 1] would otherwise wrap around
        # when cast to int16 and produce loud clicks.
        audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

        # delete=False: the caller needs the file to outlive this function.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
            tmp_path = tmp_file.name

        # Save audio
        sf.write(tmp_path, audio_int16, AUDIO_SAMPLE_RATE)

        duration = len(audio) / AUDIO_SAMPLE_RATE
        status_msg = f"Generated {duration:.2f}s of emotional speech!"
        return tmp_path, status_msg
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        import traceback
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg
# Create Gradio interface
# The first preset (in dict order) provides the initial UI values.
_preset_names = list(PRESET_CHARACTERS)
_default_preset = PRESET_CHARACTERS[_preset_names[0]]

with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Maya1 - Open Source Emotional Text-to-Speech
    **The best open source voice AI model with emotions!**
    Generate realistic and expressive speech with natural language voice design.
    Choose a preset character or create your own custom voice.
    [Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
    """)

    with gr.Row():
        # Left column: voice selection, text input and generation settings.
        with gr.Column(scale=1):
            gr.Markdown("### Character Selection")
            preset_dropdown = gr.Dropdown(
                choices=_preset_names,
                label="Preset Characters",
                value=_preset_names[0],
                # Derived from the dict so the count cannot go stale.
                info=f"Quick pick from {len(PRESET_CHARACTERS)} preset characters",
            )

            gr.Markdown("### Voice Design")
            description_input = gr.Textbox(
                label="Voice Description",
                placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
                lines=3,
                value=_default_preset["description"],
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text with <emotion> tags like <laugh>, <sigh>, <excited>...",
                lines=4,
                value=_default_preset["example_text"],
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.4,
                    step=0.1,
                    label="Temperature",
                    info="Lower = more stable, Higher = more creative",
                )
                max_tokens_slider = gr.Slider(
                    minimum=100,
                    maximum=2048,
                    value=1500,
                    step=50,
                    label="Max Tokens",
                    info="More tokens = longer audio",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        # Right column: generated audio, status text and tag reference.
        with gr.Column(scale=1):
            gr.Markdown("### Generated Audio")
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
            )
            status_output = gr.Textbox(
                label="Status",
                lines=3,
                interactive=False,
            )
            gr.Markdown("""
            ### Supported Emotions
            `<angry>` `<chuckle>` `<cry>` `<disappointed>` `<excited>` `<gasp>`
            `<giggle>` `<laugh>` `<laugh_harder>` `<sarcastic>` `<sigh>`
            `<sing>` `<whisper>`
            """)

    # Event handlers
    # Selecting a preset fills in its description and example text.
    preset_dropdown.change(
        fn=preset_selected,
        inputs=[preset_dropdown],
        outputs=[description_input, text_input],
    )
    generate_btn.click(
        fn=generate_speech,
        inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
        outputs=[audio_output, status_output],
    )

if __name__ == "__main__":
    demo.launch()