sim04ful committed 5e4143f (parent: 1a6a41a)

made narrator more dynamic

Files changed:
- arible_schema.json +39 -2
- handler.py +46 -7
- requirements.txt +1 -0
arible_schema.json CHANGED

@@ -5,13 +5,50 @@
     {
       "name": "text",
       "type": "text",
-      "description": "
+      "description": "Text to be narrated",
       "area": true,
       "options": {
         "min": 100,
-        "max":
+        "max": 50000
       },
       "title": "Content"
+    },
+    {
+      "name": "audio_urls",
+      "type": "constant",
+      "value": [
+        "https://pub-93685b189ac24b30839990a7d9a14391.r2.dev/attenborough_short.wav"
+      ]
+    },
+    {
+      "name": "gpt_cond_len",
+      "type": "constant",
+      "value": 30
+    },
+    {
+      "name": "gpt_cond_chunk_len",
+      "type": "constant",
+      "value": 4
+    },
+    {
+      "name": "max_ref_length",
+      "type": "constant",
+      "value": 16
+    },
+    {
+      "name": "temperature",
+      "type": "constant",
+      "value": 0.75
+    },
+    {
+      "name": "repetition_penalty",
+      "type": "constant",
+      "value": 2.5
+    },
+    {
+      "name": "language",
+      "type": "constant",
+      "value": "en"
     }
   ]
 }
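Only the text box is user-facing now; everything else rides along as `constant` fields. Judging from how handler.py indexes `model_input` below, the platform appears to merge these constants into the same dict as the user's `text` before invoking the handler — that merging step is an assumption, not shown in this commit. A minimal sketch of the resulting payload:

# Hypothetical model_input after the schema above is resolved:
# "text" comes from the user; each "constant" field is injected verbatim.
model_input = {
    "text": "Here, on the islands of the Galapagos, life found a way.",
    "audio_urls": [
        "https://pub-93685b189ac24b30839990a7d9a14391.r2.dev/attenborough_short.wav"
    ],
    "gpt_cond_len": 30,
    "gpt_cond_chunk_len": 4,
    "max_ref_length": 16,
    "temperature": 0.75,
    "repetition_penalty": 2.5,
    "language": "en",
}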
handler.py CHANGED

@@ -11,6 +11,36 @@ import time
 import torchaudio
 import io
 import base64
+import requests
+import tempfile
+
+
+def convert_audio_urls_to_paths(audio_urls):
+    temp_files = []
+    audio_paths = []
+
+    for url in audio_urls:
+        filename = url.split("/")[-1]
+        file_destination_path, file_object = download_tempfile(
+            file_url=url, filename=filename
+        )
+        temp_files.append(file_object)
+        audio_paths.append(file_destination_path)
+
+    return audio_paths, temp_files
+
+
+def download_tempfile(file_url, filename):
+    try:
+        response = requests.get(file_url)
+        response.raise_for_status()
+        filetype = filename.split(".")[-1]
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{filetype}")
+        temp_file.write(response.content)
+        return temp_file.name, temp_file
+    except Exception as e:
+        print(f"Error downloading file: {e}")
+        return None, None
 
 
 class EndpointHandler:

@@ -32,26 +62,28 @@ class EndpointHandler:
         self.model = model
 
     def __call__(self, model_input):
+        audio_paths, temp_files = convert_audio_urls_to_paths(model_input["audio_urls"])
 
         (
             gpt_cond_latent,
             speaker_embedding,
         ) = self.model.get_conditioning_latents(
-            audio_path=
-            gpt_cond_len=
-            gpt_cond_chunk_len=
-            max_ref_length=
+            audio_path=audio_paths,
+            gpt_cond_len=model_input["gpt_cond_len"],
+            gpt_cond_chunk_len=model_input["gpt_cond_chunk_len"],
+            max_ref_length=model_input["max_ref_length"],
         )
 
         print("Generating audio")
+
         t0 = time.time()
         out = self.model.inference(
             text=model_input["text"],
             speaker_embedding=speaker_embedding,
             gpt_cond_latent=gpt_cond_latent,
-            temperature=
-            repetition_penalty=
-            language="
+            temperature=model_input["temperature"],
+            repetition_penalty=model_input["repetition_penalty"],
+            language=model_input["language"],
             enable_text_splitting=True,
         )
         audio_file = io.BytesIO()

@@ -61,4 +93,11 @@
         inference_time = time.time() - t0
         print(f"I: Time to generate audio: {inference_time} seconds")
         audio_str = base64.b64encode(audio_file.getvalue()).decode("utf-8")
+
+        try:
+            for temp_file in temp_files:
+                os.remove(temp_file)
+        except Exception as e:
+            print(f"Error removing temp files: {e}")
+
         return {"data": audio_str, "format": "wav"}
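One caveat in the cleanup block: `convert_audio_urls_to_paths` puts the `NamedTemporaryFile` objects themselves into `temp_files`, so `os.remove(temp_file)` is handed a file object rather than a path. That raises a `TypeError`, the broad `except` prints it, and the `delete=False` files stay on disk (the diff also never shows an `import os`, though it may sit in the unshown first ten lines). A sketch of a cleanup that removes by path, assuming the same `temp_files` list:

import os

def cleanup_tempfiles(temp_files):
    # Close each handle, then delete the file by its .name path.
    for temp_file in temp_files:
        try:
            temp_file.close()
            os.remove(temp_file.name)
        except OSError as e:
            print(f"Error removing temp file {temp_file.name}: {e}")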
requirements.txt CHANGED

@@ -3,5 +3,6 @@ torchvision
 torchaudio
 deepspeed
 coqui-tts
+requests
 # numpy
 # scipy
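`requests` backs the new `download_tempfile` helper. As committed, `requests.get(file_url)` carries no timeout, so a single stalled reference-audio URL can hang the whole inference request. A guarded variant might look like the sketch below — the 30-second timeout and streaming writes are assumptions, not part of this commit:

import requests
import tempfile

def download_tempfile(file_url, filename, timeout=30):
    # Stream the reference audio to disk so large files never sit fully
    # in memory, and fail fast instead of hanging on a dead URL.
    filetype = filename.split(".")[-1]
    with requests.get(file_url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{filetype}")
        for chunk in response.iter_content(chunk_size=8192):
            temp_file.write(chunk)
        temp_file.close()
    return temp_file.name, temp_file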