Spaces:

lenML
/

ChatTTS-Forge

Running on Zero

App Files Community

zhzluke96 committed on Jun 6, 2024

Commit

c5458aa

•

1 Parent(s): f34bda5

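update: validate voice/style in the Google and OpenAI speech APIs, stub the transcription endpoint, and load ChatTTS before synthesis in the web UI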

Browse files

Files changed (6) hide show

modules/api/impl/google_api.py +15 -0
modules/api/impl/openai_api.py +29 -25
modules/api/utils.py +0 -6
modules/devices/devices.py +1 -1
modules/utils/CsvMgr.py +1 -0
webui.py +3 -1

modules/api/impl/google_api.py CHANGED Viewed

@@ -11,6 +11,7 @@ from modules.utils.audio import apply_prosody_to_audio_data
 from modules.normalization import text_normalize
 from modules import generate_audio as generate
 from modules.ssml import parse_ssml
@@ -74,6 +75,8 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
     volume_gain_db = audioConfig.get("volumeGainDb", 0)
     batch_size = audioConfig.get("batchSize", 1)
     spliter_threshold = audioConfig.get("spliterThreshold", 100)
     # TODO sample_rate
@@ -84,6 +87,18 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
     # TODO maybe need to change the sample rate
     sample_rate = 24000
     try:
         if input.text:
             # 处理文本合成逻辑

 from modules.normalization import text_normalize
 from modules import generate_audio as generate
+from modules.speaker import speaker_mgr
 from modules.ssml import parse_ssml
     volume_gain_db = audioConfig.get("volumeGainDb", 0)
     batch_size = audioConfig.get("batchSize", 1)
+    # TODO spliter_threshold
     spliter_threshold = audioConfig.get("spliterThreshold", 100)
     # TODO sample_rate
     # TODO maybe need to change the sample rate
     sample_rate = 24000
+    # TODO use speaker
+    spk = speaker_mgr.get_speaker(voice_name)
+    if spk is None:
+        raise HTTPException(
+            status_code=400, detail="The specified voice name is not supported."
+        )
+    if audio_format != "mp3" and audio_format != "wav":
+        raise HTTPException(
+            status_code=400, detail="Invalid audio encoding format specified."
+        )
     try:
         if input.text:
             # 处理文本合成逻辑

modules/api/impl/openai_api.py CHANGED Viewed

@@ -20,6 +20,9 @@ import pyrubberband as pyrb
 from modules.api import utils as api_utils
 from modules.api.Api import APIManager
 import numpy as np
@@ -29,6 +32,8 @@ class AudioSpeechRequest(BaseModel):
     voice: str = "female2"
     response_format: Literal["mp3", "wav"] = "mp3"
     speed: float = Field(1, ge=0.1, le=10, description="Speed of the audio")
     style: str = ""
     # 是否开启batch合成，小于等于1表示不适用batch
     # 开启batch合成会自动分割句子
@@ -43,20 +48,27 @@ async def openai_speech_api(
         ..., description="JSON body with model, input text, and voice"
     )
 ):
     try:
-        model = request.model
-        input_text = request.input
-        voice = request.voice
-        style = request.style
-        response_format = request.response_format
-        batch_size = request.batch_size
-        spliter_threshold = request.spliter_threshold
-        speed = request.speed
-        speed = clip(speed, 0.1, 10)
-        if not input_text:
-            raise HTTPException(status_code=400, detail="Input text is required.")
         # Normalize the text
         text = text_normalize(input_text, is_end=True)
@@ -112,7 +124,7 @@ class TranscribeSegment(BaseModel):
     start: float
     end: float
     text: str
-    tokens: List[int]
     temperature: float
     avg_logprob: float
     compression_ratio: float
@@ -124,7 +136,7 @@ class TranscriptionsVerboseResponse(BaseModel):
     language: str
     duration: float
     text: str
-    segments: List[TranscribeSegment]
 def setup(app: APIManager):
@@ -146,8 +158,8 @@ openai api document:
     @app.post(
         "/v1/audio/transcriptions",
-        response_class=TranscriptionsVerboseResponse,
-        description="WIP",
     )
     async def transcribe(
         file: UploadFile = File(...),
@@ -159,12 +171,4 @@ openai api document:
         timestamp_granularities: List[str] = Form(["segment"]),
     ):
         # TODO: Implement transcribe
-        return {
-            "file": file.filename,
-            "model": model,
-            "language": language,
-            "prompt": prompt,
-            "response_format": response_format,
-            "temperature": temperature,
-            "timestamp_granularities": timestamp_granularities,
-        }

 from modules.api import utils as api_utils
 from modules.api.Api import APIManager
+from modules.speaker import speaker_mgr
+from modules.data import styles_mgr
 import numpy as np
     voice: str = "female2"
     response_format: Literal["mp3", "wav"] = "mp3"
     speed: float = Field(1, ge=0.1, le=10, description="Speed of the audio")
+    seed: int = 42
+    temperature: float = 0.3
     style: str = ""
     # 是否开启batch合成，小于等于1表示不适用batch
     # 开启batch合成会自动分割句子
         ..., description="JSON body with model, input text, and voice"
     )
 ):
+    model = request.model
+    input_text = request.input
+    voice = request.voice
+    style = request.style
+    response_format = request.response_format
+    batch_size = request.batch_size
+    spliter_threshold = request.spliter_threshold
+    speed = request.speed
+    speed = clip(speed, 0.1, 10)
+    if not input_text:
+        raise HTTPException(status_code=400, detail="Input text is required.")
+    if speaker_mgr.get_speaker(voice) is None:
+        raise HTTPException(status_code=400, detail="Invalid voice.")
     try:
+        if style:
+            styles_mgr.find_item_by_name(style)
+    except:
+        raise HTTPException(status_code=400, detail="Invalid style.")
+    try:
         # Normalize the text
         text = text_normalize(input_text, is_end=True)
     start: float
     end: float
     text: str
+    tokens: list[int]
     temperature: float
     avg_logprob: float
     compression_ratio: float
     language: str
     duration: float
     text: str
+    segments: list[TranscribeSegment]
 def setup(app: APIManager):
     @app.post(
         "/v1/audio/transcriptions",
+        response_model=TranscriptionsVerboseResponse,
+        description="Transcribes audio into the input language.",
     )
     async def transcribe(
         file: UploadFile = File(...),
         timestamp_granularities: List[str] = Form(["segment"]),
     ):
         # TODO: Implement transcribe
+        return api_utils.success_response("not implemented yet")

modules/api/utils.py CHANGED Viewed

@@ -29,12 +29,6 @@ class BaseResponse(BaseModel):
     message: str
     data: Any
-    class Config:
-        json_encoders = {
-            torch.Tensor: lambda v: v.tolist(),
-            Speaker: lambda v: v.to_json(),
-        }
 def success_response(data: Any, message: str = "ok") -> BaseResponse:
     return BaseResponse(message=message, data=data)

     message: str
     data: Any
 def success_response(data: Any, message: str = "ok") -> BaseResponse:
     return BaseResponse(message=message, data=data)

modules/devices/devices.py CHANGED Viewed

@@ -74,7 +74,7 @@ def get_target_device_id_or_memory_available_gpu():
 def get_optimal_device_name():
-    if config.runtime_env_vars.use_cpu:
         return "cpu"
     if torch.cuda.is_available():

 def get_optimal_device_name():
+    if config.runtime_env_vars.use_cpu == "all":
         return "cpu"
     if torch.cuda.is_available():

modules/utils/CsvMgr.py CHANGED Viewed

@@ -15,6 +15,7 @@ class DataNotFoundError(Exception):
     pass
 class BaseManager:
     def __init__(self, csv_file):
         self.csv_file = csv_file

     pass
+# FIXME: 😓 this is pretty poorly written; ideally replace it with a proper CSV library...
 class BaseManager:
     def __init__(self, csv_file):
         self.csv_file = csv_file

webui.py CHANGED Viewed

@@ -40,7 +40,7 @@ from modules.api.utils import calc_spk_style
 import modules.generate_audio as generate
 from modules.normalization import text_normalize
-from modules import refiner, config
 from modules.utils import env, audio
 from modules.SentenceSplitter import SentenceSplitter
@@ -101,6 +101,7 @@ def synthesize_ssml(ssml: str, batch_size=4):
     if len(segments) == 0:
         return None
     synthesize = SynthesizeSegments(batch_size=batch_size)
     audio_segments = synthesize.synthesize_segments(segments)
     combined_audio = combine_audio_segments(audio_segments)
@@ -157,6 +158,7 @@ def tts_generate(
     if not disable_normalize:
         text = text_normalize(text)
     sample_rate, audio_data = synthesize_audio(
         text=text,
         temperature=temperature,

 import modules.generate_audio as generate
 from modules.normalization import text_normalize
+from modules import refiner, config, models
 from modules.utils import env, audio
 from modules.SentenceSplitter import SentenceSplitter
     if len(segments) == 0:
         return None
+    models.load_chat_tts()
     synthesize = SynthesizeSegments(batch_size=batch_size)
     audio_segments = synthesize.synthesize_segments(segments)
     combined_audio = combine_audio_segments(audio_segments)
     if not disable_normalize:
         text = text_normalize(text)
+    models.load_chat_tts()
     sample_rate, audio_data = synthesize_audio(
         text=text,
         temperature=temperature,