Kuberwastaken committed
Commit 8c1e484
1 Parent(s): 348e4cd

Switched to MeloTTS

Files changed (2)
  1. app.py +24 -35
  2. requirements.txt +4 -1
app.py CHANGED
@@ -7,6 +7,9 @@ import numpy as np
 from transformers import pipeline, AutoProcessor, AutoModelForVision2Seq
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import time
+import nltk
+from melo.api import TTS
+import io
 
 from transformers import BlipProcessor, BlipForConditionalGeneration
 
@@ -90,59 +93,43 @@ def generate_roast(caption, llm_components):
     return response
 
 def initialize_tts_model():
-    tts_pipeline = pipeline(
-        "text-to-speech",
-        model="parler-tts/parler-tts-mini-expresso"
-    )
-    return tts_pipeline
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    tts_model = TTS(language='EN', device=device)
+    speaker_ids = tts_model.hps.data.spk2id
+    return tts_model, speaker_ids
 
-def text_to_speech(text, tts_pipeline):
-    # Additional prompt to guide the voice style
-    styled_text = f"[[voice:female_mature]] [[speed:0.9]] [[precision:0.8]] {text}"
-
-    speech = tts_pipeline(styled_text)
-    return (speech["sampling_rate"], speech["audio"])
+def text_to_speech(text, tts_model, speaker_id='EN-US', speed=1.0):
+    bio = io.BytesIO()
+    tts_model.tts_to_file(text, tts_model.hps.data.spk2id[speaker_id], bio, speed=speed, format='wav')
+    bio.seek(0)
+    return (24000, bio.read())
 
-def process_frame(image, vision_components, llm_components, tts_pipeline):
-    # Step 1: Analyze what's in the image
+def process_frame(image, vision_components, llm_components, tts_model, speaker_id='EN-US'):
     caption = analyze_image(image, vision_components)
-
-    # Step 2: Generate roast based on the caption
     roast = generate_roast(caption, llm_components)
-
-    # Step 3: Convert roast to speech
-    audio = text_to_speech(roast, tts_pipeline)
-
+    audio = text_to_speech(roast, tts_model, speaker_id)
     return caption, roast, audio
 
 def setup_processing_chain(video_feed, analysis_output, roast_output, audio_output):
-    # Initialize all models
     vision_components = initialize_vision_model()
     llm_components = initialize_llm()
-    tts_pipeline = initialize_tts_model()
-
-    last_process_time = time.time() - 10  # Initialize with an offset
-    processing_interval = 5  # Process every 5 seconds
-
+    tts_model, speaker_ids = initialize_tts_model()
+    last_process_time = time.time() - 10
+    processing_interval = 5
     def process_webcam(image):
         nonlocal last_process_time
-
         current_time = time.time()
         if current_time - last_process_time >= processing_interval and image is not None:
             last_process_time = current_time
-
             caption, roast, audio = process_frame(
-                image,
-                vision_components,
-                llm_components,
-                tts_pipeline
+                image,
+                vision_components,
+                llm_components,
+                tts_model,
+                'EN-US'  # Default accent
             )
-
             return image, caption, roast, audio
-
-        # Return None for outputs that shouldn't update
         return image, None, None, None
-
     video_feed.change(
         process_webcam,
         inputs=[video_feed],
@@ -169,5 +156,7 @@ def create_app():
     return app
 
 if __name__ == "__main__":
+    os.system('python -m unidic download')
+    nltk.download('averaged_perceptron_tagger_eng')
     app = create_app()
     app.launch()
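
For context, the new TTS path comes down to the MeloTTS calls sketched below. This is a minimal, self-contained sketch rather than the app's exact code (it assumes the melo-tts and soundfile packages are installed and their assets downloaded): it decodes the in-memory WAV back into an array because Gradio's Audio component expects a (sample_rate, numpy array) tuple, and it takes the sample rate from the decoded file instead of hard-coding one.

import io
import soundfile as sf
from melo.api import TTS

# Load the English MeloTTS model; use device='cuda' when a GPU is available.
tts_model = TTS(language='EN', device='cpu')
speaker_ids = tts_model.hps.data.spk2id  # speaker keys such as 'EN-US', 'EN-BR', 'EN-AU'

# Synthesize into an in-memory WAV file.
buf = io.BytesIO()
tts_model.tts_to_file("Nice haircut.", speaker_ids['EN-US'], buf, speed=1.0, format='wav')
buf.seek(0)

# Decode to (sample_rate, float32 array) for a Gradio Audio output.
audio, sample_rate = sf.read(buf, dtype='float32')
gradio_audio = (sample_rate, audio)
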
requirements.txt CHANGED
@@ -6,4 +6,7 @@ pillow
 numpy
 accelerate
 git+https://github.com/huggingface/diffusers
-opencv-python
+opencv-python
+melo-tts
+nltk
+unidic-lite
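
The new requirements back the startup downloads added to app.py: melo-tts itself, nltk (MeloTTS's English text cleaning appears to rely on NLTK's POS tagger), and unidic-lite (a dictionary for the MeCab tokenizer MeloTTS imports). A small sketch of guarding those one-time downloads so they are skipped once the assets exist; the nltk.data.find path is an assumption about where NLTK caches the tagger.

import os
import nltk

# Fetch NLTK's English POS tagger only if it is not already cached.
try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
    nltk.download('averaged_perceptron_tagger_eng')

# The commit fetches the full unidic dictionary unconditionally at startup;
# unidic-lite (already in requirements.txt) ships a bundled dictionary, so this
# step may only matter when the full dictionary is preferred.
os.system('python -m unidic download')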