sim04ful committed
Commit 5e4143f
1 Parent(s): 1a6a41a

made narrator more dynamic

Files changed (3)
  1. arible_schema.json +39 -2
  2. handler.py +46 -7
  3. requirements.txt +1 -0
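
In short: handler.py previously conditioned the voice model on a hard-coded reference clip (/repository/attenborough.mp3) with fixed generation settings. This commit moves those settings into arible_schema.json as constant fields (reference audio URLs, conditioning lengths, temperature, repetition penalty, language), and handler.py now downloads the reference audio to temporary files per request, so the narrator voice and generation parameters can be changed without editing handler code. It also corrects a leftover field description ("Content of the QR code" becomes "Text to be narrated") and raises the text limit from 5,000 to 50,000 characters.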
arible_schema.json CHANGED
@@ -5,13 +5,50 @@
     {
       "name": "text",
       "type": "text",
-      "description": "Content of the QR code",
+      "description": "Text to be narrated",
       "area": true,
       "options": {
         "min": 100,
-        "max": 5000
+        "max": 50000
       },
       "title": "Content"
+    },
+    {
+      "name": "audio_urls",
+      "type": "constant",
+      "value": [
+        "https://pub-93685b189ac24b30839990a7d9a14391.r2.dev/attenborough_short.wav"
+      ]
+    },
+    {
+      "name": "gpt_cond_len",
+      "type": "constant",
+      "value": 30
+    },
+    {
+      "name": "gpt_cond_chunk_len",
+      "type": "constant",
+      "value": 4
+    },
+    {
+      "name": "max_ref_length",
+      "type": "constant",
+      "value": 16
+    },
+    {
+      "name": "temperature",
+      "type": "constant",
+      "value": 0.75
+    },
+    {
+      "name": "repetition_penalty",
+      "type": "constant",
+      "value": 2.5
+    },
+    {
+      "name": "language",
+      "type": "constant",
+      "value": "en"
     }
   ]
 }
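
Taken together, the schema yields a model_input like the sketch below for the handler: the user supplies "text", and the remaining fields are the schema's constants (assumption: the platform forwards each constant to the handler verbatim; the sample text is invented and shortened):

model_input = {
    # User-supplied; the schema enforces 100-50,000 characters.
    "text": "Deep in the canopy, a family of gibbons begins to stir...",
    # Constants injected from arible_schema.json:
    "audio_urls": [
        "https://pub-93685b189ac24b30839990a7d9a14391.r2.dev/attenborough_short.wav"
    ],
    "gpt_cond_len": 30,
    "gpt_cond_chunk_len": 4,
    "max_ref_length": 16,
    "temperature": 0.75,
    "repetition_penalty": 2.5,
    "language": "en",
}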
handler.py CHANGED
@@ -11,6 +11,36 @@ import time
 import torchaudio
 import io
 import base64
+import requests
+import tempfile
+
+
+def convert_audio_urls_to_paths(audio_urls):
+    temp_files = []
+    audio_paths = []
+
+    for url in audio_urls:
+        filename = url.split("/")[-1]
+        file_destination_path, file_object = download_tempfile(
+            file_url=url, filename=filename
+        )
+        temp_files.append(file_object)
+        audio_paths.append(file_destination_path)
+
+    return audio_paths, temp_files
+
+
+def download_tempfile(file_url, filename):
+    try:
+        response = requests.get(file_url)
+        response.raise_for_status()
+        filetype = filename.split(".")[-1]
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{filetype}")
+        temp_file.write(response.content)
+        return temp_file.name, temp_file
+    except Exception as e:
+        print(f"Error downloading file: {e}")
+        return None, None
 
 
 class EndpointHandler:
@@ -32,26 +62,28 @@ class EndpointHandler:
         self.model = model
 
     def __call__(self, model_input):
+        audio_paths, temp_files = convert_audio_urls_to_paths(model_input["audio_urls"])
 
         (
             gpt_cond_latent,
             speaker_embedding,
         ) = self.model.get_conditioning_latents(
-            audio_path="/repository/attenborough.mp3",
-            gpt_cond_len=30,
-            gpt_cond_chunk_len=4,
-            max_ref_length=60,
+            audio_path=audio_paths,
+            gpt_cond_len=model_input["gpt_cond_len"],
+            gpt_cond_chunk_len=model_input["gpt_cond_chunk_len"],
+            max_ref_length=model_input["max_ref_length"],
         )
 
         print("Generating audio")
+
         t0 = time.time()
         out = self.model.inference(
             text=model_input["text"],
             speaker_embedding=speaker_embedding,
             gpt_cond_latent=gpt_cond_latent,
-            temperature=0.75,
-            repetition_penalty=2.5,
-            language="en",
+            temperature=model_input["temperature"],
+            repetition_penalty=model_input["repetition_penalty"],
+            language=model_input["language"],
             enable_text_splitting=True,
         )
         audio_file = io.BytesIO()
@@ -61,4 +93,11 @@ class EndpointHandler:
         inference_time = time.time() - t0
         print(f"I: Time to generate audio: {inference_time} seconds")
         audio_str = base64.b64encode(audio_file.getvalue()).decode("utf-8")
+
+        try:
+            for temp_file in temp_files:
+                os.remove(temp_file)
+        except Exception as e:
+            print(f"Error removing temp files: {e}")
+
         return {"data": audio_str, "format": "wav"}
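
One caveat in the new cleanup block: convert_audio_urls_to_paths stores the NamedTemporaryFile objects themselves in temp_files, so os.remove(temp_file) is handed a file object rather than a path. os.remove raises TypeError on that, the except clause prints the error, and the downloaded clips are left on disk; the hunks shown also contain no import of os (one may exist among the earlier, unshown imports). A minimal sketch of the cleanup as presumably intended, written as a hypothetical helper:

import os


def cleanup_tempfiles(temp_files):
    # NamedTemporaryFile(delete=False) objects must be closed and then
    # removed by their .name (a path); os.remove() rejects file objects.
    try:
        for temp_file in temp_files:
            temp_file.close()
            os.remove(temp_file.name)
    except Exception as e:
        print(f"Error removing temp files: {e}")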
requirements.txt CHANGED
@@ -3,5 +3,6 @@ torchvision
 torchaudio
 deepspeed
 coqui-tts
+requests
 # numpy
 # scipy
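
For completeness, a client-side sketch of exercising the deployed handler and unpacking its {"data": ..., "format": "wav"} response. The endpoint URL and token are placeholders, it assumes the JSON body reaches EndpointHandler.__call__ unchanged, and the payload mirrors the model_input example above:

import base64

import requests

ENDPOINT_URL = "https://YOUR-ENDPOINT.example.com"  # placeholder
TOKEN = "YOUR-ACCESS-TOKEN"  # placeholder

payload = {
    "text": "Deep in the canopy, a family of gibbons begins to stir...",
    "audio_urls": [
        "https://pub-93685b189ac24b30839990a7d9a14391.r2.dev/attenborough_short.wav"
    ],
    "gpt_cond_len": 30,
    "gpt_cond_chunk_len": 4,
    "max_ref_length": 16,
    "temperature": 0.75,
    "repetition_penalty": 2.5,
    "language": "en",
}

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=payload,
    timeout=300,
)
response.raise_for_status()
body = response.json()
with open("narration.wav", "wb") as f:
    f.write(base64.b64decode(body["data"]))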