amir22010 commited on
Commit
b14edca
·
verified ·
1 Parent(s): 5a87964

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -6
app.py CHANGED
@@ -12,6 +12,7 @@ from huggingface_hub import hf_hub_download, list_repo_files
12
  from pydub import AudioSegment
13
  import io
14
  import tempfile
 
15
 
16
  #tts cpu model
17
  tts_model_str = "en_us_hifi_jets_cpu.addon"
@@ -64,6 +65,20 @@ def combine_audio_files(audio_files):
64
  os.remove(audio_file) # Remove temporary files
65
  return combined
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  #guardrail model
68
  guard_llm = "llama-3.1-8b-instant"
69
 
@@ -123,9 +138,7 @@ async def greet(product,description):
123
  audio_file = text_to_speech(a_list[0])
124
  audio_files.append(audio_file)
125
  final_audio = combine_audio_files(audio_files)
126
- output_file = "final_output.mp3"
127
- final_audio.export(output_file, format="mp3")
128
- yield final_audio
129
  else:
130
  audio_files = []
131
  output = llm.create_chat_completion(
@@ -147,9 +160,7 @@ async def greet(product,description):
147
  audio_file = text_to_speech(delta.get('content', ''))
148
  audio_files.append(audio_file)
149
  final_audio = combine_audio_files(audio_files)
150
- output_file = "final_output.mp3"
151
- final_audio.export(output_file, format="mp3")
152
- yield final_audio
153
 
154
  demo = gr.Interface(fn=greet, inputs=["text","text"], outputs=gr.Audio(), concurrency_limit=10)
155
  demo.launch()
 
12
  from pydub import AudioSegment
13
  import io
14
  import tempfile
15
+ import pydub
16
 
17
  #tts cpu model
18
  tts_model_str = "en_us_hifi_jets_cpu.addon"
 
65
  os.remove(audio_file) # Remove temporary files
66
  return combined
67
 
68
def postprocess(prediction_value):
    """Convert a pydub AudioSegment into the tuple format gr.Audio accepts.

    Args:
        prediction_value: a ``pydub.AudioSegment`` holding the synthesized
            speech (mono or stereo PCM).

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is a 1-D ``float32``
        numpy array normalized to [-1, 1], as Gradio expects for float audio.

    Raises:
        ValueError: if ``prediction_value`` is not an AudioSegment.
    """
    if not isinstance(prediction_value, pydub.AudioSegment):
        raise ValueError(f"Cannot process {type(prediction_value)} as Audio")
    # Raw interleaved PCM integers (e.g. int16 for sample_width == 2).
    samples = np.array(prediction_value.get_array_of_samples())
    # If stereo, down-mix by averaging the two interleaved channels.
    if prediction_value.channels == 2:
        samples = np.mean(samples.reshape(-1, 2), axis=1)
    # BUG FIX: Gradio interprets float arrays as amplitudes in [-1, 1].
    # The previous code cast raw PCM ints straight to float32, producing
    # values in the tens of thousands that play back as clipped noise.
    # Scale by the full-scale value implied by the sample width
    # (2**(8*width - 1), e.g. 32768 for 16-bit audio) before returning.
    full_scale = float(1 << (8 * prediction_value.sample_width - 1))
    return (prediction_value.frame_rate, (samples / full_scale).astype(np.float32))
79
+
80
+
81
+
82
  #guardrail model
83
  guard_llm = "llama-3.1-8b-instant"
84
 
 
138
  audio_file = text_to_speech(a_list[0])
139
  audio_files.append(audio_file)
140
  final_audio = combine_audio_files(audio_files)
141
+ yield postprocess(final_audio)
 
 
142
  else:
143
  audio_files = []
144
  output = llm.create_chat_completion(
 
160
  audio_file = text_to_speech(delta.get('content', ''))
161
  audio_files.append(audio_file)
162
  final_audio = combine_audio_files(audio_files)
163
+ yield postprocess(final_audio)
 
 
164
 
165
  demo = gr.Interface(fn=greet, inputs=["text","text"], outputs=gr.Audio(), concurrency_limit=10)
166
  demo.launch()