Spaces:

bachtom125
/

pronunciation-error-detection

Runtime error

App Files Community

bachtom125 committed on Jan 24

Commit

41aed3f

1 Parent(s): a10071f

add: conversion from m4a to wav

Browse files

Files changed (1) hide show

utils/general_utils.py +41 -14

utils/general_utils.py CHANGED Viewed

@@ -38,23 +38,50 @@ async def process_audio(audio, device):
             return audio_cache.contains_without_lock(filename)
         logging.info(f"Processing audio '{filename}'.")
-        # Read and preprocess the audio
-        audio_bytes = BytesIO(await audio.read())
-        audio_segment = AudioSegment.from_file(audio_bytes, format="m4a")
-        audio_samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
-        max_val = np.iinfo(np.int16).max
-        audio_samples /= max_val
-        if audio_segment.channels > 1:
-            audio_samples = audio_samples.reshape(-1, audio_segment.channels).mean(axis=1)
-        audio_input = librosa.resample(audio_samples, orig_sr=audio_segment.frame_rate, target_sr=16000)
-        # input_values = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values.to(device)
-        # Cache the processed audio
-        cache_entry = {"audio_input": audio_input, "input_values": None, "ssl_logits": None}
-        audio_cache.set_without_lock(filename, cache_entry)
-        return cache_entry
 def clean_text(text: str) -> str:
     """

             return audio_cache.contains_without_lock(filename)
         logging.info(f"Processing audio '{filename}'.")
+        # Read the audio file into a temporary file
+        with NamedTemporaryFile(delete=False, suffix=".m4a") as temp_m4a:
+            temp_m4a_path = temp_m4a.name
+            temp_m4a.write(await audio.read())
+        # Convert M4A to WAV using FFmpeg
+        temp_wav_path = temp_m4a_path.replace(".m4a", ".wav")
+        try:
+            subprocess.run(
+                [
+                    "ffmpeg", "-i", temp_m4a_path,  # Input file
+                    "-ar", "16000",                 # Resample to 16kHz
+                    "-ac", "1",                     # Convert to mono
+                    temp_wav_path                   # Output file
+                ],
+                check=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE
+            )
+        except subprocess.CalledProcessError as e:
+            logging.error(f"FFmpeg conversion failed: {e.stderr.decode()}")
+            raise HTTPException(status_code=500, detail="Failed to process audio file.")
+        finally:
+            os.remove(temp_m4a_path)  # Clean up the temporary M4A file
+        try:
+            # Read and preprocess the audio
+            audio_segment = AudioSegment.from_file(temp_wav_path, format="wav")
+            audio_samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
+            max_val = np.iinfo(np.int16).max
+            audio_samples /= max_val
+            if audio_segment.channels > 1:
+                audio_samples = audio_samples.reshape(-1, audio_segment.channels).mean(axis=1)
+            audio_input = librosa.resample(audio_samples, orig_sr=audio_segment.frame_rate, target_sr=16000)
+            # input_values = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values.to(device)
+            # Cache the processed audio
+            cache_entry = {"audio_input": audio_input, "input_values": None, "ssl_logits": None}
+            audio_cache.set_without_lock(filename, cache_entry)
+            return cache_entry
+        finally:
+            os.remove(temp_wav_path)
 def clean_text(text: str) -> str:
     """