Commit 8c1e484
Parent(s): 348e4cd
Switched to MeloTTS

Files changed:
- app.py (+24 -35)
- requirements.txt (+4 -1)
app.py CHANGED
@@ -7,6 +7,9 @@ import numpy as np
 from transformers import pipeline, AutoProcessor, AutoModelForVision2Seq
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import time
+import nltk
+from melo.api import TTS
+import io

 from transformers import BlipProcessor, BlipForConditionalGeneration

@@ -90,59 +93,43 @@ def generate_roast(caption, llm_components):
     return response

 def initialize_tts_model():
-    return tts_pipeline
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    tts_model = TTS(language='EN', device=device)
+    speaker_ids = tts_model.hps.data.spk2id
+    return tts_model, speaker_ids

-def text_to_speech(text,
-    return (speech["sampling_rate"], speech["audio"])
+def text_to_speech(text, tts_model, speaker_id='EN-US', speed=1.0):
+    bio = io.BytesIO()
+    tts_model.tts_to_file(text, tts_model.hps.data.spk2id[speaker_id], bio, speed=speed, format='wav')
+    bio.seek(0)
+    return (24000, bio.read())

-def process_frame(image, vision_components, llm_components,
-    # Step 1: Analyze what's in the image
+def process_frame(image, vision_components, llm_components, tts_model, speaker_id='EN-US'):
     caption = analyze_image(image, vision_components)
-
-    # Step 2: Generate roast based on the caption
     roast = generate_roast(caption, llm_components)
-
-    # Step 3: Convert roast to speech
-    audio = text_to_speech(roast, tts_pipeline)
-
+    audio = text_to_speech(roast, tts_model, speaker_id)
     return caption, roast, audio

 def setup_processing_chain(video_feed, analysis_output, roast_output, audio_output):
-    # Initialize all models
     vision_components = initialize_vision_model()
     llm_components = initialize_llm()
-    processing_interval = 5  # Process every 5 seconds
-
+    tts_model, speaker_ids = initialize_tts_model()
+    last_process_time = time.time() - 10
+    processing_interval = 5
     def process_webcam(image):
         nonlocal last_process_time
-
         current_time = time.time()
         if current_time - last_process_time >= processing_interval and image is not None:
             last_process_time = current_time
-
             caption, roast, audio = process_frame(
-                image,
-                vision_components,
-                llm_components,
+                image,
+                vision_components,
+                llm_components,
+                tts_model,
+                'EN-US'  # Default accent
             )
-
             return image, caption, roast, audio
-
-        # Return None for outputs that shouldn't update
         return image, None, None, None
-
     video_feed.change(
         process_webcam,
         inputs=[video_feed],
@@ -169,5 +156,7 @@ def create_app():
     return app

 if __name__ == "__main__":
+    os.system('python -m unidic download')
+    nltk.download('averaged_perceptron_tagger_eng')
     app = create_app()
     app.launch()
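For context, below is a minimal standalone sketch of the MeloTTS usage this commit adopts. It assumes the melo-tts package is installed and its English checkpoint can be downloaded; the 'EN-US' speaker key and the in-memory WAV call mirror the new text_to_speech above, while the soundfile decoding step and the variable names are illustrative additions, not part of app.py. The model's native sampling rate can also be read from tts_model.hps.data.sampling_rate instead of hard-coding a value.

import io

import nltk
import soundfile as sf
from melo.api import TTS

# POS tagger data that MeloTTS's English text cleaning expects
# (also fetched in app.py's __main__ block).
nltk.download('averaged_perceptron_tagger_eng')

# Load the English MeloTTS model; 'cuda' also works if a GPU is available.
tts_model = TTS(language='EN', device='cpu')
speaker_ids = tts_model.hps.data.spk2id  # mapping of accent keys such as 'EN-US'

# Synthesize into an in-memory WAV, the same call pattern as the new text_to_speech().
buf = io.BytesIO()
tts_model.tts_to_file("Nice webcam feed you have there.", speaker_ids['EN-US'],
                      buf, speed=1.0, format='wav')
buf.seek(0)

# Decode back to (samples, sample_rate) if a numpy array is needed,
# e.g. for a Gradio Audio output.
audio, sample_rate = sf.read(buf)
print(sample_rate, audio.shape)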
requirements.txt CHANGED
@@ -6,4 +6,7 @@ pillow
 numpy
 accelerate
 git+https://github.com/huggingface/diffusers
-opencv-python
+opencv-python
+melo-tts
+nltk
+unidic-lite