CineAI committed on
Commit
13a15d8
·
verified ·
1 Parent(s): 615c558

Update audio2text/a2t.py

Browse files
Files changed (1) hide show
  1. audio2text/a2t.py +15 -7
audio2text/a2t.py CHANGED
@@ -5,6 +5,7 @@ from .init import pipe
5
  TASK = "transcribe"
6
  BATCH_SIZE = 8
7
  LIMIT = 60
 
8
 
9
  class A2T:
10
  def __init__(self, mic):
@@ -18,23 +19,30 @@ class A2T:
18
  return transcribed_text
19
 
20
  def __preprocces(self, raw: np.ndarray, sampling_rate: int):
21
- chunk = raw.astype(np.float32) / 32678.0
22
 
23
- # if sampling_rate != 16000:
24
- # chunk = librosa.resample(chunk, orig_sr=sampling_rate, target_sr=16000)
25
 
26
- # chunk = chunk[:16000*LIMIT]
 
 
 
 
 
 
 
 
27
 
28
  return chunk
29
 
30
  def predict(self):
31
  try:
32
  if self.mic is not None:
33
- chunk = self.mic.get_array_of_samples()
34
- chunk = np.array(chunk, dtype=np.int16)
35
  sampling_rate = self.mic.frame_rate
36
  audio = self.__preprocces(raw=chunk, sampling_rate=sampling_rate)
37
- print(f"audio : {audio} \n shape : {audio.shape} \n max : {np.max(audio)}")
38
  else:
39
  raise Exception("please provide audio")
40
 
 
5
  TASK = "transcribe"
6
  BATCH_SIZE = 8
7
  LIMIT = 60
8
+ SAMPLING_RATE = 16000
9
 
10
  class A2T:
11
  def __init__(self, mic):
 
19
  return transcribed_text
20
 
21
  def __preprocces(self, raw: np.ndarray, sampling_rate: int):
22
+ chunk = raw.astype(np.float32, order='C') / 32768.0
23
 
24
+ print(f"Chunk : {chunk}")
 
25
 
26
+ if len(chunk.shape) > 1:
27
+ chunk = librosa.to_mono(chunk.T)
28
+
29
+ if sampling_rate != SAMPLING_RATE:
30
+ chunk = librosa.resample(chunk, orig_sr=sampling_rate, target_sr=SAMPLING_RATE)
31
+
32
+ print(f"Sampling rate : {chunk}")
33
+
34
+ chunk = chunk[:SAMPLING_RATE*LIMIT]
35
 
36
  return chunk
37
 
38
  def predict(self):
39
  try:
40
  if self.mic is not None:
41
+ raw = self.mic.get_array_of_samples()
42
+ chunk = np.array(raw, dtype=np.int16)
43
  sampling_rate = self.mic.frame_rate
44
  audio = self.__preprocces(raw=chunk, sampling_rate=sampling_rate)
45
+ print(f"audio : {audio} \n shape : {audio.shape} \n max : {np.max(audio)} \n shape of chunk : {chunk.shape} \n sampling rate : {sampling_rate} \n raw audio : {raw} \n chunk : {chunk}")
46
  else:
47
  raise Exception("please provide audio")
48