vineelpratap committed on
Commit
d697dab
1 Parent(s): 90945f2

Update lid.py

Browse files
Files changed (1) hide show
  1. lid.py +15 -19
lid.py CHANGED
@@ -1,6 +1,7 @@
1
  from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
2
  import torch
3
  import librosa
 
4
 
5
  model_id = "facebook/mms-lid-1024"
6
 
@@ -19,21 +20,16 @@ with open(f"data/lid/all_langs.tsv") as f:
19
  LID_LANGUAGES[iso] = name
20
 
21
 
22
- def identify(audio_source=None, microphone=None, file_upload=None):
23
- if audio_source is None and microphone is None and file_upload is None:
24
- # HACK: need to handle this case for some reason
25
- return {}
26
-
27
- if type(microphone) is dict:
28
- # HACK: microphone variable is a dict when running on examples
29
- microphone = microphone["name"]
30
- audio_fp = (
31
- file_upload if "upload" in str(audio_source or "").lower() else microphone
32
- )
33
- if audio_fp is None:
34
- return "ERROR: You have to either use the microphone or upload an audio file"
35
-
36
- audio_samples = librosa.load(audio_fp, sr=LID_SAMPLING_RATE, mono=True)[0]
37
 
38
  inputs = processor(
39
  audio_samples, sampling_rate=LID_SAMPLING_RATE, return_tensors="pt"
@@ -67,7 +63,7 @@ def identify(audio_source=None, microphone=None, file_upload=None):
67
 
68
 
69
  LID_EXAMPLES = [
70
- [None, "./assets/english.mp3", None],
71
- [None, "./assets/tamil.mp3", None],
72
- [None, "./assets/burmese.mp3", None],
73
- ]
 
1
  from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
2
  import torch
3
  import librosa
4
+ import numpy as np
5
 
6
  model_id = "facebook/mms-lid-1024"
7
 
 
20
  LID_LANGUAGES[iso] = name
21
 
22
 
23
+ def identify(audio_data):
24
+ if isinstance(audio_data, tuple):
25
+ # microphone
26
+ sr, audio_samples = audio_data
27
+ audio_samples = (audio_samples / 32768.0).astype(np.float)
28
+ assert sr == LID_SAMPLING_RATE, "Invalid sampling rate"
29
+ else:
30
+ # file upload
31
+ isinstance(audio_data, str)
32
+ audio_samples = librosa.load(audio_data, sr=LID_SAMPLING_RATE, mono=True)[0]
 
 
 
 
 
33
 
34
  inputs = processor(
35
  audio_samples, sampling_rate=LID_SAMPLING_RATE, return_tensors="pt"
 
63
 
64
 
65
# Demo audio clips for the language-identification UI; each entry is a
# single-element argument list (one audio file path) for the `identify` fn.
LID_EXAMPLES = [
    [clip]
    for clip in (
        "./assets/english.mp3",
        "./assets/tamil.mp3",
        "./assets/burmese.mp3",
    )
]