oceansweep committed on
Commit
0c30c9f
1 Parent(s): 70bce05

Update App_Function_Libraries/Audio_Transcription_Lib.py

Browse files
App_Function_Libraries/Audio_Transcription_Lib.py CHANGED
@@ -1,254 +1,202 @@
1
- # Audio_Transcription_Lib.py
2
- #########################################
3
- # Transcription Library
4
- # This library is used to perform transcription of audio files.
5
- # Currently, uses faster_whisper for transcription.
6
- #
7
- ####################
8
- # Function List
9
- #
10
- # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
- # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
12
- #
13
- ####################
14
- #
15
- # Import necessary libraries to run solo for testing
16
- import gc
17
- import json
18
- import logging
19
- import os
20
- import queue
21
- import sys
22
- import subprocess
23
- import tempfile
24
- import threading
25
- import time
26
- import configparser
27
- # DEBUG Imports
28
- #from memory_profiler import profile
29
- import pyaudio
30
-
31
- from App_Function_Libraries.Utils.Utils import load_comprehensive_config
32
-
33
- # Import Local
34
- #
35
- #######################################################################################################################
36
- # Function Definitions
37
- #
38
-
39
- # Convert video .m4a into .wav using ffmpeg
40
- # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
41
- # https://www.gyan.dev/ffmpeg/builds/
42
- #
43
-
44
-
45
- whisper_model_instance = None
46
- config = load_comprehensive_config()
47
- processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
48
-
49
-
50
- # FIXME: This is a temporary solution.
51
- # This doesn't clear older models, which means potentially a lot of memory is being used...
52
- def get_whisper_model(model_name, device):
53
- global whisper_model_instance
54
- if whisper_model_instance is None:
55
- from faster_whisper import WhisperModel
56
- logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
57
- whisper_model_instance = WhisperModel(model_name, device=device)
58
- return whisper_model_instance
59
-
60
-
61
- # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
62
- #DEBUG
63
- #@profile
64
- def convert_to_wav(video_file_path, offset=0, overwrite=False):
65
- out_path = os.path.splitext(video_file_path)[0] + ".wav"
66
-
67
- if os.path.exists(out_path) and not overwrite:
68
- print(f"File '{out_path}' already exists. Skipping conversion.")
69
- logging.info(f"Skipping conversion as file already exists: {out_path}")
70
- return out_path
71
- print("Starting conversion process of .m4a to .WAV")
72
- out_path = os.path.splitext(video_file_path)[0] + ".wav"
73
-
74
- try:
75
- if os.name == "nt":
76
- logging.debug("ffmpeg being ran on windows")
77
-
78
- if sys.platform.startswith('win'):
79
- ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
80
- logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
81
- else:
82
- ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
83
-
84
- command = [
85
- ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists
86
- "-ss", "00:00:00", # Start at the beginning of the video
87
- "-i", video_file_path,
88
- "-ar", "16000", # Audio sample rate
89
- "-ac", "1", # Number of audio channels
90
- "-c:a", "pcm_s16le", # Audio codec
91
- out_path
92
- ]
93
- try:
94
- # Redirect stdin from null device to prevent ffmpeg from waiting for input
95
- with open(os.devnull, 'rb') as null_file:
96
- result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
97
- if result.returncode == 0:
98
- logging.info("FFmpeg executed successfully")
99
- logging.debug("FFmpeg output: %s", result.stdout)
100
- else:
101
- logging.error("Error in running FFmpeg")
102
- logging.error("FFmpeg stderr: %s", result.stderr)
103
- raise RuntimeError(f"FFmpeg error: {result.stderr}")
104
- except Exception as e:
105
- logging.error("Error occurred - ffmpeg doesn't like windows")
106
- raise RuntimeError("ffmpeg failed")
107
- elif os.name == "posix":
108
- os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
109
- else:
110
- raise RuntimeError("Unsupported operating system")
111
- logging.info("Conversion to WAV completed: %s", out_path)
112
- except subprocess.CalledProcessError as e:
113
- logging.error("Error executing FFmpeg command: %s", str(e))
114
- raise RuntimeError("Error converting video file to WAV")
115
- except Exception as e:
116
- logging.error("speech-to-text: Error transcribing audio: %s", str(e))
117
- return {"error": str(e)}
118
- gc.collect()
119
- return out_path
120
-
121
-
122
- # Transcribe .wav into .segments.json
123
- #DEBUG
124
- #@profile
125
- def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
126
- global whisper_model_instance, processing_choice
127
- logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
128
-
129
- time_start = time.time()
130
- if audio_file_path is None:
131
- raise ValueError("speech-to-text: No audio file provided")
132
- logging.info("speech-to-text: Audio file path: %s", audio_file_path)
133
-
134
- try:
135
- _, file_ending = os.path.splitext(audio_file_path)
136
- out_file = audio_file_path.replace(file_ending, ".segments.json")
137
- prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
138
- if os.path.exists(out_file):
139
- logging.info("speech-to-text: Segments file already exists: %s", out_file)
140
- with open(out_file) as f:
141
- global segments
142
- segments = json.load(f)
143
- return segments
144
-
145
- logging.info('speech-to-text: Starting transcription...')
146
- options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
147
- transcribe_options = dict(task="transcribe", **options)
148
- # use function and config at top of file
149
- whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
150
- segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)
151
-
152
- segments = []
153
- for segment_chunk in segments_raw:
154
- chunk = {
155
- "Time_Start": segment_chunk.start,
156
- "Time_End": segment_chunk.end,
157
- "Text": segment_chunk.text
158
- }
159
- logging.debug("Segment: %s", chunk)
160
- segments.append(chunk)
161
- # Print to verify its working
162
- print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
163
-
164
- # Log it as well.
165
- logging.debug(
166
- f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
167
-
168
- if segments:
169
- segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
170
-
171
- if not segments:
172
- raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
173
- logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)
174
-
175
- # Save the segments to a JSON file - prettified and non-prettified
176
- # FIXME so this is an optional flag to save either the prettified json file or the normal one
177
- save_json = True
178
- if save_json:
179
- logging.info("speech-to-text: Saving segments to JSON file")
180
- output_data = {'segments': segments}
181
-
182
- logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
183
- with open(prettified_out_file, 'w') as f:
184
- json.dump(output_data, f, indent=2)
185
-
186
- logging.info("speech-to-text: Saving JSON to %s", out_file)
187
- with open(out_file, 'w') as f:
188
- json.dump(output_data, f)
189
-
190
- logging.debug(f"speech-to-text: returning {segments[:500]}")
191
- gc.collect()
192
- return segments
193
-
194
- except Exception as e:
195
- logging.error("speech-to-text: Error transcribing audio: %s", str(e))
196
- raise RuntimeError("speech-to-text: Error transcribing audio")
197
-
198
-
199
def record_audio(duration, sample_rate=16000, chunk_size=1024):
    """Start capturing microphone audio on a background thread.

    Opens a 16-bit mono PyAudio input stream and launches a reader thread
    that pushes raw byte chunks onto a queue until either `duration`
    seconds of audio have been read or the stop event is set.

    Args:
        duration: maximum recording length in seconds.
        sample_rate: capture sample rate in Hz (default 16 kHz).
        chunk_size: frames read per stream.read() call.

    Returns:
        (p, stream, audio_queue, stop_event, audio_thread) — pass all five
        to stop_recording() to finish the capture and collect the bytes.

    NOTE(review): requires the third-party `pyaudio` package at module
    scope; the newer revision of this file comments that import out.
    """
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=sample_rate,
                    input=True,
                    frames_per_buffer=chunk_size)

    print("Recording...")
    # (Removed dead local `frames = []` — chunks travel via audio_queue and
    # are joined later by stop_recording(), never through a local list here.)
    stop_recording = threading.Event()
    audio_queue = queue.Queue()

    def audio_callback():
        # Read at most `duration` seconds worth of chunks, bailing out
        # early if the caller signals a stop.
        for _ in range(0, int(sample_rate / chunk_size * duration)):
            if stop_recording.is_set():
                break
            data = stream.read(chunk_size)
            audio_queue.put(data)

    audio_thread = threading.Thread(target=audio_callback)
    audio_thread.start()

    return p, stream, audio_queue, stop_recording, audio_thread
223
-
224
-
225
def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
    """Stop a capture started by record_audio() and return the raw bytes.

    Signals the reader thread to finish, waits for it, drains every chunk
    it queued, then tears down the stream and the PyAudio instance.
    """
    # Tell the capture thread to stop and wait until it has exited.
    stop_recording_event.set()
    audio_thread.join()

    # Drain the queue; after join() no producer remains, so get_nowait()
    # raising Empty means we have collected everything.
    collected = []
    try:
        while True:
            collected.append(audio_queue.get_nowait())
    except queue.Empty:
        pass

    print("Recording finished.")

    # Release the audio resources in acquisition-reverse order.
    stream.stop_stream()
    stream.close()
    p.terminate()

    return b''.join(collected)
240
-
241
def save_audio_temp(audio_data, sample_rate=16000):
    """Write raw 16-bit mono PCM bytes to a temporary .wav file.

    Args:
        audio_data: raw little-endian 16-bit mono PCM frames.
        sample_rate: frame rate recorded in the WAV header (default 16 kHz).

    Returns:
        Path of the temp file. Created with delete=False, so the caller is
        responsible for removing it when done.
    """
    import wave
    # Close the NamedTemporaryFile handle before re-opening the path with
    # wave.open(): the original kept both handles open, which fails on
    # Windows where an open temp file cannot be opened a second time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = temp_file.name
    with wave.open(temp_path, 'wb') as wf:
        wf.setnchannels(1)            # mono
        wf.setsampwidth(2)            # 16-bit samples
        wf.setframerate(sample_rate)
        wf.writeframes(audio_data)
    return temp_path
251
-
252
- #
253
- #
254
  #######################################################################################################################
 
1
+ # Audio_Transcription_Lib.py
2
+ #########################################
3
+ # Transcription Library
4
+ # This library is used to perform transcription of audio files.
5
+ # Currently, uses faster_whisper for transcription.
6
+ #
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
+ # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False)
12
+ #
13
+ ####################
14
+ #
15
+ # Import necessary libraries to run solo for testing
16
+ import gc
17
+ import json
18
+ import logging
19
+ import os
20
+ import queue
21
+ import sys
22
+ import subprocess
23
+ import tempfile
24
+ import threading
25
+ import time
26
+ import configparser
27
+ # DEBUG Imports
28
+ #from memory_profiler import profile
29
+ #import pyaudio
30
+
31
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
32
+
33
+ # Import Local
34
+ #
35
+ #######################################################################################################################
36
+ # Function Definitions
37
+ #
38
+
39
+ # Convert video .m4a into .wav using ffmpeg
40
+ # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
41
+ # https://www.gyan.dev/ffmpeg/builds/
42
+ #
43
+
44
+
45
# Lazily-populated singleton holding the currently loaded faster_whisper
# model; set by get_whisper_model() on first use.
whisper_model_instance = None
# Load the application-wide config once at import time; 'processing_choice'
# selects the compute device used for transcription ('cpu' fallback).
config = load_comprehensive_config()
processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
48
+
49
+
50
def get_whisper_model(model_name, device):
    """Return a cached faster_whisper WhisperModel, loading it lazily.

    Fixes the FIXME in the original: previously, once any model was
    loaded, every later call returned it regardless of which model_name
    or device was requested. The loaded (model_name, device) pair is now
    remembered on the function object, and a mismatch triggers a reload.

    Args:
        model_name: faster_whisper model size/name (e.g. 'medium.en').
        device: compute device passed to WhisperModel (e.g. 'cpu', 'cuda').

    Returns:
        The shared WhisperModel instance (also stored in the module-level
        `whisper_model_instance` global).
    """
    global whisper_model_instance
    requested = (model_name, device)
    if (whisper_model_instance is None
            or getattr(get_whisper_model, "_loaded_key", None) != requested):
        from faster_whisper import WhisperModel
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        whisper_model_instance = WhisperModel(model_name, device=device)
        # Remember what is loaded so a later call with different arguments
        # reloads instead of returning a stale model. NOTE: the previous
        # model is simply dropped; freeing its memory is left to GC.
        get_whisper_model._loaded_key = requested
    return whisper_model_instance
59
+
60
+
61
# ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
#DEBUG
#@profile
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    """Convert a media file to a 16 kHz mono PCM-s16le .wav via ffmpeg.

    Args:
        video_file_path: path to the source media file.
        offset: unused (kept for backward compatibility with callers).
        overwrite: when False (default) an existing .wav is reused.

    Returns:
        Path to the .wav file on success, or a {"error": str} dict on an
        unexpected failure (dict return kept for backward compatibility
        with existing callers).

    Raises:
        RuntimeError: on an unsupported operating system.
    """
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        return out_path
    print("Starting conversion process of .m4a to .WAV")

    try:
        # Pick the ffmpeg binary; Windows builds ship one under .\Bin.
        if os.name == "nt":
            logging.debug("ffmpeg being ran on windows")
            if sys.platform.startswith('win'):
                ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
                logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
            else:
                ffmpeg_cmd = 'ffmpeg'
        elif os.name == "posix":
            ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH
        else:
            raise RuntimeError("Unsupported operating system")

        command = [
            ffmpeg_cmd,
            "-ss", "00:00:00",      # Start at the beginning of the video
            "-i", video_file_path,
            "-ar", "16000",         # Audio sample rate
            "-ac", "1",             # Number of audio channels
            "-c:a", "pcm_s16le",    # Audio codec
            out_path
        ]
        # Run ffmpeg identically on every OS with an argument list
        # (shell=False). This replaces the old posix os.system() f-string,
        # which had shell-quoting/injection problems and silently ignored
        # nonzero exit codes. DEVNULL stdin stops ffmpeg waiting for input.
        result = subprocess.run(command, stdin=subprocess.DEVNULL,
                                text=True, capture_output=True)
        if result.returncode == 0:
            logging.info("FFmpeg executed successfully")
            logging.debug("FFmpeg output: %s", result.stdout)
        else:
            logging.error("Error in running FFmpeg")
            logging.error("FFmpeg stderr: %s", result.stderr)
            raise RuntimeError(f"FFmpeg error: {result.stderr}")
        logging.info("Conversion to WAV completed: %s", out_path)
    except subprocess.CalledProcessError as e:
        logging.error("Error executing FFmpeg command: %s", str(e))
        raise RuntimeError("Error converting video file to WAV")
    except Exception as e:
        # Fixed log message: the original said "speech-to-text: Error
        # transcribing audio", copy-pasted from another function.
        logging.error("convert_to_wav: Error converting file to WAV: %s", str(e))
        return {"error": str(e)}
    gc.collect()
    return out_path
120
+
121
+
122
# Transcribe .wav into .segments.json
#DEBUG
#@profile
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    """Transcribe an audio file with faster_whisper into segment dicts.

    Results are cached next to the audio file as ``<name>.segments.json``
    (plus a prettified copy); a later call for the same file returns the
    cached segments without reloading the model.

    Args:
        audio_file_path: path to the audio file to transcribe.
        selected_source_lang: language code passed to the model.
        whisper_model: faster_whisper model name (default 'medium.en').
        vad_filter: enable voice-activity-detection filtering.
        diarize: accepted for interface compatibility; not used here.

    Returns:
        List of {"Time_Start", "Time_End", "Text"} dicts. The first
        segment's text is prefixed with a note naming the model used.

    Raises:
        ValueError: if audio_file_path is None.
        RuntimeError: on any transcription failure (original cause chained).
    """
    global whisper_model_instance, processing_choice
    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)

    time_start = time.time()
    if audio_file_path is None:
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    try:
        _, file_ending = os.path.splitext(audio_file_path)
        out_file = audio_file_path.replace(file_ending, ".segments.json")
        prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
        if os.path.exists(out_file):
            logging.info("speech-to-text: Segments file already exists: %s", out_file)
            with open(out_file) as f:
                data = json.load(f)
            # BUGFIX: the file is saved as {'segments': [...]}, but the old
            # code returned that wrapper dict on a cache hit while a fresh
            # transcription returned the bare list. Unwrap so the return
            # type is consistent either way. (Also dropped the needless
            # `global segments`, which leaked state to module scope.)
            if isinstance(data, dict) and 'segments' in data:
                return data['segments']
            return data

        logging.info('speech-to-text: Starting transcription...')
        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
        transcribe_options = dict(task="transcribe", **options)
        # Use the shared model cache and config from the top of the file.
        whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
        segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)
            # Print to verify its working
            print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
            # Log it as well.
            logging.debug(
                f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")

        if not segments:
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
        segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
        logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)

        # Save the segments to a JSON file - prettified and non-prettified
        # FIXME so this is an optional flag to save either the prettified json file or the normal one
        save_json = True
        if save_json:
            logging.info("speech-to-text: Saving segments to JSON file")
            output_data = {'segments': segments}

            logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
            with open(prettified_out_file, 'w') as f:
                json.dump(output_data, f, indent=2)

            logging.info("speech-to-text: Saving JSON to %s", out_file)
            with open(out_file, 'w') as f:
                json.dump(output_data, f)

        # BUGFIX: segments[:500] truncated to 500 *segments*; truncate the
        # rendered string to 500 characters as intended.
        logging.debug("speech-to-text: returning %s", str(segments)[:500])
        gc.collect()
        return segments

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        # Chain the original cause so the real failure is debuggable.
        raise RuntimeError("speech-to-text: Error transcribing audio") from e
197
+
198
+
199
+
200
+ #
201
+ #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  #######################################################################################################################