Spaces: Running on Zero
Use grammar to avoid generation errors
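This commit constrains llama.cpp decoding with a GBNF grammar built from the words the model is asked to speak, so the sampler can no longer emit malformed OuteTTS token sequences. For intuition only, here is a minimal sketch of the underlying llama-cpp-python mechanism; the model path and toy grammar are placeholders and not part of this Space:

# Illustrative sketch: grammar-constrained decoding with llama-cpp-python.
# "model.gguf" and the toy grammar are placeholders, not files from this repo.
from llama_cpp import Llama
from llama_cpp.llama import LlamaGrammar

grammar = LlamaGrammar.from_string('root ::= "yes" | "no"')
llm = Llama(model_path="model.gguf", verbose=False)
out = llm("Is water wet? Answer: ", grammar=grammar, max_tokens=4)
print(out["choices"][0]["text"])  # decoding is constrained to "yes" or "no"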
app.py CHANGED
@@ -6,13 +6,18 @@ import json
 import tempfile
 import hashlib
 import os
+import re
 from typing import Optional
+from llama_cpp.llama import LlamaGrammar
+from outetts.version.interface import InterfaceLLAMACPP
 from outetts.models.info import MODEL_INFO
 from outetts.utils import helpers
 from huggingface_hub import hf_hub_download
 import torch
 from transformers import BitsAndBytesConfig
 import spaces
+import numpy as np
+from collections import OrderedDict
 
 # Available OuteTTS models based on the documentation
 MODELS = {v.value: v for _, v in outetts.Models.__members__.items()}
@@ -26,6 +31,77 @@ MODEL_QUANTIZATION = {
 # Cache for speaker profiles to avoid re-transcribing the same audio
 speaker_cache = {}
 
+SPLIT_SYMBOL = {
+    outetts.InterfaceVersion.V1: '<|space|>',
+    outetts.InterfaceVersion.V2: '<|space|>',
+    outetts.InterfaceVersion.V3: ' ',
+}
+
+def word_to_grammar(word):
+    if all(ord(c) < 128 for c in word):
+        return f'"{word}"'
+    return f'[{"".join(OrderedDict.fromkeys(word))}]+'
+
+# patch InterfaceLLAMACPP, inject new _generate method
+InterfaceLLAMACPP._orig_generate = InterfaceLLAMACPP._generate
+def ggml_generate(self, input_ids, config):
+    tokenizer = self.prompt_processor.tokenizer
+    split = SPLIT_SYMBOL.get(self.config.interface_version, ' ')
+    prompt = tokenizer.decode(input_ids, skip_special_tokens=False)
+    prompt_no_special = tokenizer.decode(input_ids, skip_special_tokens=True).strip()
+    if '<|text_start|>' not in prompt:
+        return self._orig_generate(input_ids, config)
+    speaker_text_last = prompt_no_special.split('\n').pop()
+    text = prompt[prompt.index('<|text_start|>')+14:prompt.index('<|text_end|>')]
+    gen_text = text[text.index(speaker_text_last)+len(speaker_text_last):].strip(split) if speaker_text_last in text else text
+    words = [word_to_grammar(word) for word in gen_text.split(split)]
+    if self.config.interface_version == outetts.InterfaceVersion.V2:
+        config.additional_gen_config["grammar"] = LlamaGrammar.from_string(f"""\
+root ::= NL? {' audioBlock '.join(words)} audioEnd NL EOS?
+audioBlock ::= TIME CODE* space NL?
+TEXT ::= [A-Za-z0-9 .,?!]+
+EOS ::= "<|im_end|>"
+emotionStart ::= "<|emotion_start|>"
+emotionEnd ::= "<|emotion_end|>"
+audioEnd ::= "<|audio_end|>"
+space ::= "<|space|>"
+WORD ::= {' | '.join(words)}
+NL ::= [\\n]
+TIME ::= "<|t_" DECIMAL "|>"
+CODE ::= "<|" DIGITS "|>"
+DIGITS ::= [0-9]+
+DECIMAL ::= [0-9]+ "." [0-9]+
+punch ::= "<|" [a-z_]+ "|>"
+""")
+    elif self.config.interface_version == outetts.InterfaceVersion.V3:
+        config.additional_gen_config["grammar"] = LlamaGrammar.from_string(f"""\
+root ::= leadWord wordBlock* audioEnd NL EOS?
+leadWord ::= WORD audioBlock
+wordBlock ::= wordStart WORD audioBlock
+audioBlock ::= codeBlock wordEnd NL?
+codeBlock ::= features TIME energy spectralCentroid pitch CODE CODES*
+TEXT ::= [A-Za-z0-9.,!?]+
+EOS ::= "<|im_end|>"
+audioEnd ::= "<|audio_end|>"
+wordStart ::= "<|word_start|>"
+wordEnd ::= "<|word_end|>"
+features ::= "<|features|>"
+energy ::= "<|energy_" DIGITS "|>"
+spectralCentroid ::= "<|spectral_centroid_" DIGITS "|>"
+pitch ::= "<|pitch_" DIGITS "|>"
+WORD ::= {' | '.join(words)}
+NL ::= [\\n]
+TIME ::= "<|t_" DECIMAL "|>"
+CODE ::= "<|code|>"
+CODES ::= CODE1 CODE2
+CODE1 ::= "<|c1_" DIGITS "|>"
+CODE2 ::= "<|c2_" DIGITS "|>"
+DIGITS ::= [0-9]+
+DECIMAL ::= [0-9]+ "." [0-9]+
+""")
+    return self._orig_generate(input_ids, config)
+InterfaceLLAMACPP._generate = ggml_generate
+
 def get_file_hash(file_path):
     """Calculate MD5 hash of a file for caching purposes."""
     hash_md5 = hashlib.md5()
@@ -34,7 +110,7 @@ def get_file_hash(file_path):
             hash_md5.update(chunk)
     return hash_md5.hexdigest()
 
-def try_ggml_model(model: outetts.Models, backend: outetts.Backend, quantization: outetts.LlamaCppQuantization):
+def try_ggml_model(model: outetts.Models, quantization: outetts.LlamaCppQuantization):
     model_config = MODEL_INFO[model]
     repo = f"OuteAI/{model.value}-GGUF"
     filename = f"{model.value}-{quantization.value}.gguf"
@@ -45,12 +121,12 @@ def try_ggml_model(model: outetts.Models, backend: outetts.Backend, quantization
         local_files_only=False
     )
     generation_type = outetts.GenerationType.CHUNKED
-    if model_config['interface_version'] == outetts.InterfaceVersion.V3:
-        generation_type = outetts.GenerationType.GUIDED_WORDS
+    # if model_config['interface_version'] == outetts.InterfaceVersion.V3:
+    #     generation_type = outetts.GenerationType.GUIDED_WORDS
    return outetts.ModelConfig(
         model_path=model_path,
         tokenizer_path=f"OuteAI/{model.value}",
-        backend=backend,
+        backend=outetts.Backend.LLAMACPP,
         n_gpu_layers=99,
         verbose=False,
         device=None,
@@ -67,7 +143,7 @@ def get_interface(model_name: str):
 
     try:
         quantization = MODEL_QUANTIZATION.get(model, outetts.LlamaCppQuantization.Q8_0)
-        config = try_ggml_model(model, outetts.Backend.LLAMACPP, quantization)
+        config = try_ggml_model(model, quantization)
     except:
         has_cuda = torch.cuda.is_available()
         model_config = MODEL_INFO[model]
@@ -98,7 +174,7 @@ def get_or_create_speaker(interface, audio_file):
     # Check if speaker profile is already cached
     if cache_key in speaker_cache:
         print(f"✅ Using cached speaker profile for {os.path.basename(audio_file)}")
-        return speaker_cache[cache_key]
+        return json.loads(speaker_cache[cache_key])
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -108,7 +184,7 @@ def get_or_create_speaker(interface, audio_file):
     speaker = interface.create_speaker(audio_file, whisper_model="large-v3-turbo", whisper_device=device)
 
     # Cache the speaker profile
-    speaker_cache[cache_key] = speaker
+    speaker_cache[cache_key] = json.dumps(speaker)
     print(f"💾 Cached speaker profile ({len(speaker_cache)} total cached)")
 
     return speaker
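For reference, a small standalone demo (not part of app.py) of how the word_to_grammar helper above turns the text to be generated into grammar terminals; the example words are made up:

# Standalone demo of the word_to_grammar helper from the patch above.
from collections import OrderedDict

def word_to_grammar(word):
    if all(ord(c) < 128 for c in word):
        return f'"{word}"'                              # ASCII word -> exact string literal
    return f'[{"".join(OrderedDict.fromkeys(word))}]+'  # otherwise -> character-class rule

words = [word_to_grammar(w) for w in ["Hello,", "héllo"]]
print(words)                             # ['"Hello,"', '[hélo]+']  (duplicate 'l' removed by OrderedDict.fromkeys)
print(f"WORD ::= {' | '.join(words)}")   # WORD ::= "Hello," | [hélo]+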