Update audio_processing.py

audio_processing.py (CHANGED: +37 -46)
@@ -19,6 +19,26 @@ OVERLAP = 2
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Global variables for models
+device = "cuda" if torch.cuda.is_available() else "cpu"
+compute_type = "float16" if device == "cuda" else "float32"
+whisper_model = None
+diarization_pipeline = None
+
+def load_models(model_size="small"):
+    global whisper_model, diarization_pipeline
+
+    # Load Whisper model
+    whisper_model = whisperx.load_model(model_size, device, compute_type=compute_type)
+
+    # Try to initialize diarization pipeline
+    try:
+        diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)
+        diarization_pipeline = diarization_pipeline.to(torch.device(device))
+    except Exception as e:
+        logger.warning(f"Diarization pipeline initialization failed: {str(e)}. Diarization will not be available.")
+        diarization_pipeline = None
+
 def preprocess_audio(audio, chunk_size=CHUNK_LENGTH*16000, overlap=OVERLAP*16000):
     chunks = []
     for i in range(0, len(audio), chunk_size - overlap):
@@ -28,24 +48,25 @@ def preprocess_audio(audio, chunk_size=CHUNK_LENGTH*16000, overlap=OVERLAP*16000):
         chunks.append(chunk)
     return chunks
 
-@spaces.GPU
+@spaces.GPU
 def process_audio(audio_file, translate=False, model_size="small"):
+    global whisper_model, diarization_pipeline
+
+    if whisper_model is None:
+        load_models(model_size)
+
     start_time = time.time()
 
     try:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        compute_type = "float16" if device == "cuda" else "float32"
         audio = whisperx.load_audio(audio_file)
-        model = whisperx.load_model(model_size, device, compute_type=compute_type)
 
-        #
-
-
-
-
-
-
-        diarization_result = None
+        # Perform diarization if pipeline is available
+        diarization_result = None
+        if diarization_pipeline is not None:
+            try:
+                diarization_result = diarization_pipeline({"waveform": torch.from_numpy(audio).unsqueeze(0), "sample_rate": 16000})
+            except Exception as e:
+                logger.warning(f"Diarization failed: {str(e)}. Proceeding without diarization.")
 
         chunks = preprocess_audio(audio)
 
@@ -57,10 +78,10 @@ def process_audio(audio_file, translate=False, model_size="small"):
             chunk_start_time = i * (CHUNK_LENGTH - overlap_duration)
             chunk_end_time = chunk_start_time + CHUNK_LENGTH
             logger.info(f"Processing chunk {i+1}/{len(chunks)}")
-            lang =
-            result_transcribe =
+            lang = whisper_model.detect_language(chunk)
+            result_transcribe = whisper_model.transcribe(chunk, language=lang)
             if translate:
-                result_translate =
+                result_translate = whisper_model.transcribe(chunk, task="translate")
             chunk_start_time = i * (CHUNK_LENGTH - overlap_duration)
             for j, t_seg in enumerate(result_transcribe["segments"]):
                 segment_start = chunk_start_time + t_seg["start"]
@@ -115,34 +136,4 @@ def process_audio(audio_file, translate=False, model_size="small"):
         logger.error(f"An error occurred during audio processing: {str(e)}")
         raise
 
-
-    merged = []
-    for segment in segments:
-        if not merged or segment['start'] - merged[-1]['end'] > time_threshold:
-            merged.append(segment)
-        else:
-            # Find the overlap
-            matcher = SequenceMatcher(None, merged[-1]['text'], segment['text'])
-            match = matcher.find_longest_match(0, len(merged[-1]['text']), 0, len(segment['text']))
-
-            if match.size / len(segment['text']) > similarity_threshold:
-                # Merge the segments
-                merged_text = merged[-1]['text'] + segment['text'][match.b + match.size:]
-                merged_translated = merged[-1].get('translated', '') + segment.get('translated', '')[match.b + match.size:]
-
-                merged[-1]['end'] = segment['end']
-                merged[-1]['text'] = merged_text
-                if 'translated' in segment:
-                    merged[-1]['translated'] = merged_translated
-            else:
-                # If no significant overlap, append as a new segment
-                merged.append(segment)
-    return merged
-
-def print_results(segments):
-    for segment in segments:
-        print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['language']}) {segment['speaker']}:")
-        print(f"Original: {segment['text']}")
-        if 'translated' in segment:
-            print(f"Translated: {segment['translated']}")
-        print()
+# The merge_nearby_segments and print_results functions remain unchanged
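Note on the new diarization call: recent pyannote.audio pipelines accept an in-memory mapping of the form {"waveform": tensor of shape (channels, samples), "sample_rate": 16000}, which is what the unsqueeze(0) above produces from the 1-D whisperx audio array. A minimal sketch of reading speaker turns back out of the result; diarization_result is the object from this diff, while speaker_at is a hypothetical helper built on the standard pyannote itertracks API:

def speaker_at(diarization_result, t):
    # Return the speaker label active at time t (in seconds), if any.
    # itertracks(yield_label=True) yields (turn, track_id, speaker_label).
    for turn, _, speaker in diarization_result.itertracks(yield_label=True):
        if turn.start <= t <= turn.end:
            return speaker
    return "UNKNOWN"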
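The closing comment says merge_nearby_segments and print_results "remain unchanged", yet the last hunk removes them; in the old file the merge helper's body also sat without its def line. For reference, here is a sketch of the deleted body wrapped in a complete function. The signature and default thresholds are assumptions, not values recovered from this repo:

from difflib import SequenceMatcher

def merge_nearby_segments(segments, time_threshold=1.0, similarity_threshold=0.5):
    # Assumed signature: the def line is not recoverable from the diff.
    merged = []
    for segment in segments:
        if not merged or segment['start'] - merged[-1]['end'] > time_threshold:
            merged.append(segment)
            continue
        # Find the longest common run between the previous text and this one.
        matcher = SequenceMatcher(None, merged[-1]['text'], segment['text'])
        match = matcher.find_longest_match(0, len(merged[-1]['text']), 0, len(segment['text']))
        if segment['text'] and match.size / len(segment['text']) > similarity_threshold:
            # Significant overlap: splice on the non-overlapping remainder.
            merged[-1]['text'] += segment['text'][match.b + match.size:]
            if 'translated' in segment:
                # Caveat kept from the original code: this reuses offsets
                # computed on 'text', so the translated merge is approximate.
                merged[-1]['translated'] = merged[-1].get('translated', '') + segment['translated'][match.b + match.size:]
            merged[-1]['end'] = segment['end']
        else:
            # No significant overlap: keep as a separate segment.
            merged.append(segment)
    return merged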
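With the module-level cache, the first process_audio call loads the models and later calls reuse them. A minimal usage sketch, assuming process_audio returns the final segment list (its return statement falls outside the hunks shown) and that "example.wav" is a placeholder file name:

if __name__ == "__main__":
    # First call triggers load_models(); subsequent calls reuse the cache.
    segments = process_audio("example.wav", translate=True, model_size="small")
    print_results(segments)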