geethareddy committed on
Commit
bb2885a
·
verified ·
1 Parent(s): 1313aad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -119
app.py CHANGED
@@ -1,22 +1,20 @@
 
1
  from flask import Flask, render_template, request, jsonify
2
  import os
3
- import torchafrom flask import Flask, render_template, request, jsonify
4
- import os
5
- import torch
6
  import re
7
- import ffmpeg # Ensure FFmpeg is installed
8
  from transformers import pipeline
9
  from gtts import gTTS
10
  from pydub import AudioSegment
11
  from pydub.silence import detect_nonsilent
12
  from waitress import serve
13
- import whisper # Improved Whisper ASR Model
14
 
15
  app = Flask(__name__)
16
 
17
  # Load Whisper Model for Highly Accurate Speech-to-Text
18
  device = "cuda" if torch.cuda.is_available() else "cpu"
19
- asr_model = whisper.load_model("large-v3", device=device)
20
 
21
  # Function to generate audio prompts
22
  def generate_audio_prompt(text, filename):
@@ -47,7 +45,7 @@ SYMBOL_MAPPING = {
47
  "space": " "
48
  }
49
 
50
- # Function to convert audio to WAV format (Fixes FFmpeg issues)
51
  def convert_to_wav(input_path, output_path):
52
  try:
53
  audio = AudioSegment.from_file(input_path)
@@ -55,7 +53,7 @@ def convert_to_wav(input_path, output_path):
55
  except Exception as e:
56
  raise Exception(f"Audio conversion failed: {str(e)}")
57
 
58
- # Function to clean transcribed text (Removes unnecessary words)
59
  def clean_transcription(text):
60
  text = text.lower().strip()
61
  ignore_phrases = ["my name is", "this is", "i am", "it's", "name"]
@@ -67,11 +65,11 @@ def clean_transcription(text):
67
 
68
  return text.capitalize()
69
 
70
- # Function to check if the audio contains actual speech
71
  def is_silent_audio(audio_path):
72
  audio = AudioSegment.from_wav(audio_path)
73
  nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
74
- return len(nonsilent_parts) == 0 # Returns True if silence detected
75
 
76
  @app.route("/")
77
  def index():
@@ -83,130 +81,26 @@ def transcribe():
83
  return jsonify({"error": "No audio file provided"}), 400
84
 
85
  audio_file = request.files["audio"]
86
- input_audio_path = os.path.join("static", "temp_input")
87
  output_audio_path = os.path.join("static", "temp.wav")
88
  audio_file.save(input_audio_path)
89
 
90
  try:
91
- # Convert audio to WAV format
92
  convert_to_wav(input_audio_path, output_audio_path)
93
 
94
- # Check if the audio contains valid speech
95
  if is_silent_audio(output_audio_path):
96
  return jsonify({"error": "No speech detected. Please try again."}), 400
97
 
98
  # Transcribe using Whisper
99
- result = asr_model.transcribe(output_audio_path, language="en")
100
  transcribed_text = clean_transcription(result["text"])
101
 
102
  return jsonify({"text": transcribed_text})
103
  except Exception as e:
104
  return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
105
 
106
- # Use Waitress for Production Server
107
  if __name__ == "__main__":
108
  serve(app, host="0.0.0.0", port=7860)
109
-
110
- import re
111
- import ffmpeg # Ensures FFmpeg is installed
112
- from transformers import pipeline
113
- from gtts import gTTS
114
- from pydub import AudioSegment
115
- from pydub.silence import detect_nonsilent
116
- from waitress import serve
117
- import whisper_timestamped # Improved Whisper with timestamps
118
-
119
- app = Flask(__name__)
120
-
121
- # Load Whisper Model for Highly Accurate Speech-to-Text
122
- device = "cuda" if torch.cuda.is_available() else "cpu"
123
- asr_model = whisper_timestamped.load_model("medium", device=device)
124
-
125
- # Function to generate audio prompts
126
- def generate_audio_prompt(text, filename):
127
- tts = gTTS(text=text, lang="en")
128
- tts.save(os.path.join("static", filename))
129
-
130
- # Generate required voice prompts
131
- prompts = {
132
- "welcome": "Welcome to Biryani Hub.",
133
- "ask_name": "Tell me your name.",
134
- "ask_email": "Please provide your email address.",
135
- "thank_you": "Thank you for registration."
136
- }
137
-
138
- for key, text in prompts.items():
139
- generate_audio_prompt(text, f"{key}.mp3")
140
-
141
- # Symbol mapping for proper recognition
142
- SYMBOL_MAPPING = {
143
- "at the rate": "@",
144
- "at": "@",
145
- "dot": ".",
146
- "underscore": "_",
147
- "hash": "#",
148
- "plus": "+",
149
- "dash": "-",
150
- "comma": ",",
151
- "space": " "
152
- }
153
-
154
- # Function to convert audio to WAV format (Fixes FFmpeg issues)
155
- def convert_to_wav(input_path, output_path):
156
- try:
157
- audio = AudioSegment.from_file(input_path)
158
- audio.export(output_path, format="wav")
159
- except Exception as e:
160
- raise Exception(f"Audio conversion failed: {str(e)}")
161
-
162
- # Function to clean transcribed text (Removes unnecessary words)
163
- def clean_transcription(text):
164
- text = text.lower().strip()
165
- ignore_phrases = ["my name is", "this is", "i am", "it's", "name"]
166
- for phrase in ignore_phrases:
167
- text = text.replace(phrase, "").strip()
168
-
169
- for word, symbol in SYMBOL_MAPPING.items():
170
- text = text.replace(word, symbol)
171
-
172
- return text.capitalize()
173
-
174
- # Function to check if the audio contains actual speech
175
- def is_silent_audio(audio_path):
176
- audio = AudioSegment.from_wav(audio_path)
177
- nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
178
- return len(nonsilent_parts) == 0 # Returns True if silence detected
179
-
180
- @app.route("/")
181
- def index():
182
- return render_template("index.html")
183
-
184
- @app.route("/transcribe", methods=["POST"])
185
- def transcribe():
186
- if "audio" not in request.files:
187
- return jsonify({"error": "No audio file provided"}), 400
188
-
189
- audio_file = request.files["audio"]
190
- input_audio_path = os.path.join("static", "temp_input")
191
- output_audio_path = os.path.join("static", "temp.wav")
192
- audio_file.save(input_audio_path)
193
-
194
- try:
195
- # Convert audio to WAV format
196
- convert_to_wav(input_audio_path, output_audio_path)
197
-
198
- # Check if the audio contains valid speech
199
- if is_silent_audio(output_audio_path):
200
- return jsonify({"error": "No speech detected. Please try again."}), 400
201
-
202
- # Transcribe using Whisper
203
- result = asr_model.transcribe(output_audio_path, language="en")
204
- transcribed_text = clean_transcription(result["text"])
205
-
206
- return jsonify({"text": transcribed_text})
207
- except Exception as e:
208
- return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
209
-
210
- # Use Waitress for Production Server
211
- if __name__ == "__main__":
212
- serve(app, host="0.0.0.0", port=7860)
 
1
+ import torch
2
  from flask import Flask, render_template, request, jsonify
3
  import os
 
 
 
4
  import re
5
+ import ffmpeg
6
  from transformers import pipeline
7
  from gtts import gTTS
8
  from pydub import AudioSegment
9
  from pydub.silence import detect_nonsilent
10
  from waitress import serve
11
+ import whisper # Corrected whisper import
12
 
13
  app = Flask(__name__)
14
 
15
  # Load Whisper Model for Highly Accurate Speech-to-Text
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
17
+ asr_model = whisper.load_model("large-v3")
18
 
19
  # Function to generate audio prompts
20
  def generate_audio_prompt(text, filename):
 
45
  "space": " "
46
  }
47
 
48
+ # Function to convert audio to WAV format
49
  def convert_to_wav(input_path, output_path):
50
  try:
51
  audio = AudioSegment.from_file(input_path)
 
53
  except Exception as e:
54
  raise Exception(f"Audio conversion failed: {str(e)}")
55
 
56
+ # Function to clean transcribed text
57
  def clean_transcription(text):
58
  text = text.lower().strip()
59
  ignore_phrases = ["my name is", "this is", "i am", "it's", "name"]
 
65
 
66
  return text.capitalize()
67
 
68
+ # Function to check if audio contains actual speech
69
  def is_silent_audio(audio_path):
70
  audio = AudioSegment.from_wav(audio_path)
71
  nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
72
+ return len(nonsilent_parts) == 0
73
 
74
  @app.route("/")
75
  def index():
 
81
  return jsonify({"error": "No audio file provided"}), 400
82
 
83
  audio_file = request.files["audio"]
84
+ input_audio_path = os.path.join("static", "temp_input.wav")
85
  output_audio_path = os.path.join("static", "temp.wav")
86
  audio_file.save(input_audio_path)
87
 
88
  try:
89
+ # Convert to WAV
90
  convert_to_wav(input_audio_path, output_audio_path)
91
 
92
+ # Check for silence
93
  if is_silent_audio(output_audio_path):
94
  return jsonify({"error": "No speech detected. Please try again."}), 400
95
 
96
  # Transcribe using Whisper
97
+ result = asr_model.transcribe(output_audio_path)
98
  transcribed_text = clean_transcription(result["text"])
99
 
100
  return jsonify({"text": transcribed_text})
101
  except Exception as e:
102
  return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
103
 
104
+ # Start Waitress Production Server
105
  if __name__ == "__main__":
106
  serve(app, host="0.0.0.0", port=7860)