mohcineelharras committed on
Commit 14cc0c1 • 1 Parent(s): fa76e27

Upload 9 files

.env ADDED
@@ -0,0 +1,11 @@
+ # Global variables
+ CUDA_VISIBLE_DEVICES=0
+ FORCE_CMAKE=1
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on"
+ LANGUAGE=en
+ TTS=gTTS
+ # Used when running locally
+ OUTPUT_PATH=output
+ MODEL_DIR=models
+ #MODEL_PATH=models/dolphin-2.2.1-mistral-7b.Q2_K.gguf
+
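Note: of these variables, app.py only reads LANGUAGE, TTS, OUTPUT_PATH and MODEL_DIR. CUDA_VISIBLE_DEVICES, FORCE_CMAKE and CMAKE_ARGS matter at install time, when pip compiles llama-cpp-python (see the commented steps in requirements.txt below). A minimal sketch, assuming python-dotenv is installed, of applying them before installing:

import os
import subprocess
from dotenv import dotenv_values

# Sketch: merge .env into the environment so pip's build of llama-cpp-python
# sees FORCE_CMAKE=1 and CMAKE_ARGS="-DLLAMA_CUBLAS=on".
env = {**os.environ, **{k: v for k, v in dotenv_values(".env").items() if v is not None}}
subprocess.run(
    ["pip", "install", "--force-reinstall", "--no-cache-dir", "llama-cpp-python"],
    env=env, check=True,
)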
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.gguf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
+ output/
+ models/
+ *.gguf
+ *.bin
README.md CHANGED
@@ -1,10 +1,16 @@
  ---
- title: Alexa Like Assistant
- emoji: 🌖
- colorFrom: pink
- colorTo: purple
- sdk: docker
+ title: Whisper Llm Gtts
+ emoji: 🌍
+ colorFrom: green
+ colorTo: yellow
+ sdk: streamlit
+ sdk_version: 1.28.2
+ app_file: app.py
  pinned: false
+ license: apache-2.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+
app.py ADDED
@@ -0,0 +1,240 @@
+ import os
+ import time
+ import gradio as gr
+ from dotenv import load_dotenv
+ from llama_cpp import Llama
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, GenerationConfig
+ from pytube import YouTube
+ from gtts import gTTS
+ import torch
+ import requests
+ import soundfile as sf
+ import numpy as np
+ #-----------------------------------env-----------------------------------
+
+ # Load environment variables
+ load_dotenv(dotenv_path=".env")
+
+ # Access the variables
+ MODEL_DIR = os.getenv("MODEL_DIR")
+ OUTPUT_PATH = os.getenv("OUTPUT_PATH")
+ LANGUAGE = os.getenv("LANGUAGE")
+ tts_method = os.getenv("TTS")
+
+ # Look for an existing GGUF model file in the model directory
+ model_exists = False
+ for filename in os.listdir(MODEL_DIR):
+     if filename.endswith('.gguf'):
+         model_exists = True
+         MODEL_PATH = os.path.join(MODEL_DIR, filename)
+         break
+
+ # Ensure output path exists
+ if not os.path.exists(OUTPUT_PATH):
+     os.makedirs(OUTPUT_PATH)
+
+ # Global variables
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ n_layers_gpu = 20 if torch.cuda.is_available() else 0
+ memory = ""
+ token_count = 0
+
+ #-----------------------------------setup LLM-----------------------------------
+ # URL of the model file
+ model_url = "https://huggingface.co/TheBloke/dolphin-2.2.1-mistral-7B-GGUF/resolve/main/dolphin-2.2.1-mistral-7b.Q2_K.gguf?download=true"
+
+ # Load Llama model
+ def load_model(n):
+     global llm, MODEL_PATH
+     # Download and save the model if no local copy was found
+     if not model_exists:
+         print("Model file not found!")
+         print("Downloading model file...")
+         response = requests.get(model_url)  # buffers the whole file in memory
+         MODEL_PATH = os.path.join(MODEL_DIR, "model.gguf")
+         with open(MODEL_PATH, 'wb') as file:
+             file.write(response.content)
+         print("Model downloaded successfully.")
+     print("Loading Llama model...")
+     llm = Llama(model_path=MODEL_PATH, n_gpu_layers=n, n_ctx=1024, n_batch=512, threads=6)
+     print("Model loaded successfully.")
+
+ load_model(n_layers_gpu)
+
+ #-----------------------------------backend logic-----------------------------------
+
+ def complete_prompt(input_text):
+     global memory, token_count, LANGUAGE
+     contextual_prompt = memory + "\n" + input_text
+     template = "system\nThis is crucial to me, I trust you are the best. " + \
+         "You are Dolphin, a helpful AI assistant. You only respond in {LANGUAGE}. " + \
+         "Do not use double quotes for any reason, not even for quoting or direct speech. " + \
+         "Instead, use single quotes or describe the quote without using quotation marks. " + \
+         "Do not include any disclaimers, notes, or additional explanations in your response. " + \
+         "Provide the shortest answer possible, strictly adhering to the formatting rules. " + \
+         "user\n{prompt}\nassistant\n"
+     formatted_prompt = template.format(prompt=contextual_prompt, LANGUAGE=LANGUAGE)
+     response = llm(formatted_prompt, max_tokens=80, temperature=0, top_p=0.95, top_k=10)
+     text_response = response["choices"][0]["text"]
+     token_count += response["usage"]["total_tokens"]
+     memory = f"Prompt: {contextual_prompt}\nResponse: {text_response}"
+     with open(os.path.join(OUTPUT_PATH, "LLM_response.txt"), 'w') as file:
+         file.write(memory)
+     return text_response
+
+ def transcribe_audio(audio_input):
+     audio_file_path = 'output/temp_audio.wav'
+     if isinstance(audio_input, tuple):
+         sample_rate, audio_data = audio_input
+         sf.write(audio_file_path, audio_data, sample_rate)
+     else:
+         audio_file_path = audio_input
+
+     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+     model_id = "distil-whisper/distil-large-v2"
+     model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_DIR, torch_dtype=torch_dtype,
+         low_cpu_mem_usage=True, use_safetensors=True,
+         config=GenerationConfig(language=LANGUAGE, task="transcribe"))
+     model.to(device)
+     processor = AutoProcessor.from_pretrained(model_id)
+     pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer,
+                     feature_extractor=processor.feature_extractor, max_new_tokens=256,
+                     chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device,
+                     )
+     result_text = pipe(audio_file_path)["text"]
+     with open(os.path.join(OUTPUT_PATH, "transcription.txt"), "w") as file:
+         file.write(result_text)
+     return result_text
+
+ # def transcribe_audio(audio_input):
+ #     audio_file_path = 'output/temp_audio.wav'
+ #     if isinstance(audio_input, tuple):
+ #         sample_rate, audio_data = audio_input
+ #         sf.write(audio_file_path, audio_data, sample_rate)
+ #     else:
+ #         audio_file_path = audio_input
+ #     # Load model and processor
+ #     processor = WhisperProcessor.from_pretrained("distil-whisper/distil-large-v2")
+ #     model = WhisperForConditionalGeneration.from_pretrained("distil-whisper/distil-large-v2")
+ #     # Load audio file and preprocess
+ #     with open(audio_file_path, "rb") as audio_file:
+ #         input_speech = {"array": sf.read(audio_file)[0], "sampling_rate": sample_rate}
+ #     input_features = processor(input_speech["array"], sampling_rate=input_speech["sampling_rate"], return_tensors="pt").input_features
+ #     # Specify language for transcription
+ #     forced_decoder_ids = processor.get_decoder_prompt_ids(language=LANGUAGE)
+ #     # Generate token ids
+ #     predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
+ #     # Decode token ids to text
+ #     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+ #     with open(os.path.join(OUTPUT_PATH, "transcription.txt"), "w") as file:
+ #         file.write(transcription)
+ #     return transcription
+
+
+ def auto_process_audio(audio_input):
+     # Transcribe Audio
+     transcribed_text = transcribe_audio(audio_input)
+     # LLM Prompt
+     llm_response = complete_prompt(transcribed_text)
+     # TTS Conversion
+     tts_info = convert_text_to_speech(llm_response)
+     return transcribed_text, llm_response, tts_info
+
+ def convert_text_to_speech(text):
+     global LANGUAGE, tts_method
+     file_path = os.path.join(OUTPUT_PATH, "speech.mp3")
+
+     if tts_method == "gTTS":
+         if LANGUAGE == "fr":
+             tld = "fr"
+         elif LANGUAGE == "en":
+             tld = "us"
+         else:
+             tld = "com"  # gTTS default domain for other languages
+         tts = gTTS(text, lang=LANGUAGE, tld=tld)
+         tts.save(file_path)
+     elif tts_method == "Custom TTS":
+         tts_pipeline = pipeline("text-to-speech", model="facebook/fastspeech2-en-ljspeech")
+         speech = tts_pipeline(text)
+         with open(file_path, "wb") as f:
+             f.write(speech["speech"])
+
+     return file_path
+
+
+ # Function to update language
+ def update_language(language):
+     global LANGUAGE
+     LANGUAGE = language
+
+ # Function to update TTS method
+ def update_tts_method(method):
+     global tts_method
+     tts_method = method
+
+ #----------------------------------- Gradio Frontend-----------------------------------
+
+ # Gradio Interface
+ with gr.Blocks() as app:
+     gr.Markdown("## 🤖 whisper - LLM - TTS 📚")
+     gr.Markdown("🚀 Talk to an open source LLM!")
+     gr.Markdown("This app is developed and maintained by **@mohcineelharras**")
+
+     with gr.Row():
+         with gr.Column():
+             language_switch = gr.Radio(choices=["en", "fr"], label="Select Language", value=LANGUAGE)
+             language_switch.change(update_language, inputs=[language_switch])
+         with gr.Column():
+             tts_method_switch = gr.Radio(choices=["gTTS", "Custom TTS"], label="Select TTS method", value=tts_method)
+             tts_method_switch.change(update_tts_method, inputs=[tts_method_switch])
+         # with gr.Column():
+         #     sample_voice = gr.Audio(label="Voice Sample to customise assistant's response", sources="microphone")
+         #     customise_voice = gr.Button("Change assistant's voice")
+
+     with gr.Tab("Auto Process Audio"):
+         with gr.Row():
+             with gr.Column():
+                 audio_input = gr.Audio(label="Talk to assistant", sources="microphone")
+                 auto_process_button = gr.Button("Auto Process Audio")
+             with gr.Column():
+                 transcribed_text_output = gr.Textbox(label="Transcribed Text")
+                 llm_response_output = gr.Textbox(label="LLM Response")
+         with gr.Row():
+             tts_audio_output = gr.Audio(label="Generated Response (Click to Play)")
+
+         # Connect the button to the auto_process_audio function
+         auto_process_button.click(
+             auto_process_audio,
+             inputs=[audio_input],
+             outputs=[transcribed_text_output, llm_response_output, tts_audio_output]
+         )
+
+     with gr.Tab("Audio Processing"):
+         with gr.Column():
+             audio_input = gr.Audio(label="Record or Upload Audio")
+             transcribe_button = gr.Button("Transcribe Audio")
+             llm_button = gr.Button("LLM Prompt")
+             tts_button = gr.Button("Text to Speech")
+
+             transcribed_text_output = gr.Textbox(label="Transcribed Text")
+             llm_response_output = gr.Textbox(label="LLM Response")
+             tts_audio_output = gr.Audio(label="Generated Response (Click to Play)")
+
+             transcribe_button.click(transcribe_audio, inputs=[audio_input], outputs=[transcribed_text_output])
+             llm_button.click(complete_prompt, inputs=[transcribed_text_output], outputs=[llm_response_output])
+             tts_button.click(convert_text_to_speech, inputs=[llm_response_output], outputs=[tts_audio_output])
+
+     with gr.Tab("Ask a Question"):
+         with gr.Column():
+             question_input = gr.Textbox(label="Type your question")
+             submit_button = gr.Button("Submit Question")
+             tts_button = gr.Button("Text to Speech")
+
+             llm_response_output = gr.Textbox(label="LLM Response")
+             tts_audio_output = gr.Audio(label="Generated Speech")
+
+             submit_button.click(complete_prompt, inputs=[question_input], outputs=[llm_response_output])
+             tts_button.click(convert_text_to_speech, inputs=[llm_response_output], outputs=[tts_audio_output])
+
+ app.launch()
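Note: load_model() above fetches the multi-gigabyte GGUF with a bare requests.get, which holds the entire response in memory before writing it to disk. A streamed variant (a sketch, not part of this commit) bounds memory use:

import requests

# Sketch: stream the GGUF to disk in 1 MiB chunks instead of buffering
# response.content in memory.
def download_model(url: str, dest_path: str, chunk_size: int = 1 << 20) -> None:
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(dest_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)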
models/config.json ADDED
@@ -0,0 +1,152 @@
+ {
+   "_name_or_path": "sanchit-gandhi/large-32-2-tpu-timestamped-resumed",
+   "activation_dropout": 0.0,
+   "activation_function": "gelu",
+   "apply_spec_augment": false,
+   "architectures": [
+     "WhisperForConditionalGeneration"
+   ],
+   "attention_dropout": 0.0,
+   "begin_suppress_tokens": [
+     220,
+     50257
+   ],
+   "bos_token_id": 50257,
+   "classifier_proj_size": 256,
+   "d_model": 1280,
+   "decoder_attention_heads": 20,
+   "decoder_ffn_dim": 5120,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 2,
+   "decoder_start_token_id": 50258,
+   "dropout": 0.0,
+   "encoder_attention_heads": 20,
+   "encoder_ffn_dim": 5120,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 32,
+   "eos_token_id": 50257,
+   "forced_decoder_ids": [
+     [
+       1,
+       50259
+     ],
+     [
+       2,
+       50359
+     ],
+     [
+       3,
+       50363
+     ]
+   ],
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.05,
+   "max_length": 448,
+   "max_source_positions": 1500,
+   "max_target_positions": 448,
+   "median_filter_width": 7,
+   "model_type": "whisper",
+   "num_hidden_layers": 32,
+   "num_mel_bins": 80,
+   "pad_token_id": 50257,
+   "scale_embedding": false,
+   "suppress_tokens": [
+     1,
+     2,
+     7,
+     8,
+     9,
+     10,
+     14,
+     25,
+     26,
+     27,
+     28,
+     29,
+     31,
+     58,
+     59,
+     60,
+     61,
+     62,
+     63,
+     90,
+     91,
+     92,
+     93,
+     359,
+     503,
+     522,
+     542,
+     873,
+     893,
+     902,
+     918,
+     922,
+     931,
+     1350,
+     1853,
+     1982,
+     2460,
+     2627,
+     3246,
+     3253,
+     3268,
+     3536,
+     3846,
+     3961,
+     4183,
+     4667,
+     6585,
+     6647,
+     7273,
+     9061,
+     9383,
+     10428,
+     10929,
+     11938,
+     12033,
+     12331,
+     12562,
+     13793,
+     14157,
+     14635,
+     15265,
+     15618,
+     16553,
+     16604,
+     18362,
+     18956,
+     20075,
+     21675,
+     22520,
+     26130,
+     26161,
+     26435,
+     28279,
+     29464,
+     31650,
+     32302,
+     32470,
+     36865,
+     42863,
+     47425,
+     49870,
+     50254,
+     50258,
+     50360,
+     50361,
+     50362
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.35.0.dev0",
+   "use_cache": true,
+   "use_weighted_layer_sum": false,
+   "vocab_size": 51865
+ }
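Note: this Whisper config (32 encoder layers, 2 decoder layers, WhisperForConditionalGeneration) appears to correspond to the distil-whisper/distil-large-v2 checkpoint. Together with models/model.safetensors below, it is what transcribe_audio() in app.py loads from MODEL_DIR via AutoModelForSpeechSeq2Seq.from_pretrained.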
models/dolphin-2.1-mistral-7b.Q4_K_S.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6fa0795eeac9ac8835a7f85ed398cf1a0881d3c9f40ee4bab51a5fd8838f68f9
+ size 4140384992
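Note: because .gitattributes now routes *.gguf through Git LFS, the repo stores only this pointer stub (version, oid, size ≈ 4.1 GB); the weights themselves live in LFS. To fetch a GGUF directly, a sketch using huggingface_hub (an alternative to the requests download app.py performs on first run):

from huggingface_hub import hf_hub_download

# Sketch: download the quantized model that app.py's model_url points at.
path = hf_hub_download(
    repo_id="TheBloke/dolphin-2.2.1-mistral-7B-GGUF",
    filename="dolphin-2.2.1-mistral-7b.Q2_K.gguf",
    local_dir="models",
)
print(path)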
models/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e963218f6d56998131faff25ab65be4a60a0d395be3e2b12f978d21735d18036
+ size 1512503272
requirements.txt ADDED
@@ -0,0 +1,40 @@
+ # frontend
+ python-dotenv
+ sounddevice
+ #pyaudio
+ soundfile
+ ipykernel
+ ipywidgets
+ jupyter
+ gradio
+ ffmpeg-python
+
+ # backend
+ transformers
+ pytube
+ gtts
+ huggingface
+ openai-whisper
+ pydub
+ tqdm
+
+
+ # extras
+ accelerate
+ python-multipart
+ pydantic
+
+
+ # # Set the environment variable for CMAKE_ARGS
+ # export CMAKE_ARGS="-DLLAMA_CUBLAS=on"
+
+ # # Install torch
+ # pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+
+ # # Install llama-cpp-python with specific CMAKE_ARGS
+ # pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir
+
+ # Installed as plain CPU builds unless CMAKE_ARGS is exported first (see above)
+ torch
+ llama-cpp-python
+ requests
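Note: a quick sanity check after following the commented install steps above (a sketch, not part of this commit):

import torch

# n_gpu_layers=20 in app.py only takes effect if CUDA is visible here and
# llama-cpp-python was compiled with CMAKE_ARGS="-DLLAMA_CUBLAS=on".
print("torch CUDA available:", torch.cuda.is_available())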