bachtom125 committed on
Commit
41aed3f
·
1 Parent(s): a10071f

add: conversion from m4a to wav

Browse files
Files changed (1) hide show
  1. utils/general_utils.py +41 -14
utils/general_utils.py CHANGED
@@ -38,23 +38,50 @@ async def process_audio(audio, device):
38
  return audio_cache.contains_without_lock(filename)
39
  logging.info(f"Processing audio '{filename}'.")
40
 
41
- # Read and preprocess the audio
42
- audio_bytes = BytesIO(await audio.read())
43
- audio_segment = AudioSegment.from_file(audio_bytes, format="m4a")
44
- audio_samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
45
- max_val = np.iinfo(np.int16).max
46
- audio_samples /= max_val
47
 
48
- if audio_segment.channels > 1:
49
- audio_samples = audio_samples.reshape(-1, audio_segment.channels).mean(axis=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- audio_input = librosa.resample(audio_samples, orig_sr=audio_segment.frame_rate, target_sr=16000)
52
- # input_values = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values.to(device)
 
 
 
 
53
 
54
- # Cache the processed audio
55
- cache_entry = {"audio_input": audio_input, "input_values": None, "ssl_logits": None}
56
- audio_cache.set_without_lock(filename, cache_entry)
57
- return cache_entry
 
 
 
 
 
 
 
 
58
 
59
  def clean_text(text: str) -> str:
60
  """
 
38
  return audio_cache.contains_without_lock(filename)
39
  logging.info(f"Processing audio '{filename}'.")
40
 
41
+ # Read the audio file into a temporary file
42
+ with NamedTemporaryFile(delete=False, suffix=".m4a") as temp_m4a:
43
+ temp_m4a_path = temp_m4a.name
44
+ temp_m4a.write(await audio.read())
 
 
45
 
46
+ # Convert M4A to WAV using FFmpeg
47
+ temp_wav_path = temp_m4a_path.replace(".m4a", ".wav")
48
+ try:
49
+ subprocess.run(
50
+ [
51
+ "ffmpeg", "-i", temp_m4a_path, # Input file
52
+ "-ar", "16000", # Resample to 16kHz
53
+ "-ac", "1", # Convert to mono
54
+ temp_wav_path # Output file
55
+ ],
56
+ check=True,
57
+ stdout=subprocess.PIPE,
58
+ stderr=subprocess.PIPE
59
+ )
60
+ except subprocess.CalledProcessError as e:
61
+ logging.error(f"FFmpeg conversion failed: {e.stderr.decode()}")
62
+ raise HTTPException(status_code=500, detail="Failed to process audio file.")
63
+ finally:
64
+ os.remove(temp_m4a_path) # Clean up the temporary M4A file
65
 
66
+ try:
67
+ # Read and preprocess the audio
68
+ audio_segment = AudioSegment.from_file(temp_wav_path, format="wav")
69
+ audio_samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
70
+ max_val = np.iinfo(np.int16).max
71
+ audio_samples /= max_val
72
 
73
+ if audio_segment.channels > 1:
74
+ audio_samples = audio_samples.reshape(-1, audio_segment.channels).mean(axis=1)
75
+
76
+ audio_input = librosa.resample(audio_samples, orig_sr=audio_segment.frame_rate, target_sr=16000)
77
+ # input_values = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values.to(device)
78
+
79
+ # Cache the processed audio
80
+ cache_entry = {"audio_input": audio_input, "input_values": None, "ssl_logits": None}
81
+ audio_cache.set_without_lock(filename, cache_entry)
82
+ return cache_entry
83
+ finally:
84
+ os.remove(temp_wav_path)
85
 
86
  def clean_text(text: str) -> str:
87
  """