Futuresony commited on
Commit
c9a70b0
·
verified ·
1 Parent(s): be18557

Update lid.py

Browse files
Files changed (1) hide show
  1. lid.py +41 -40
lid.py CHANGED
@@ -2,77 +2,78 @@ from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
2
  import torch
3
  import librosa
4
  import numpy as np
 
5
 
6
- model_id = "facebook/mms-lid-1024"
7
-
8
- processor = AutoFeatureExtractor.from_pretrained(model_id)
9
- model = Wav2Vec2ForSequenceClassification.from_pretrained(model_id)
10
-
11
 
 
12
  LID_SAMPLING_RATE = 16_000
13
- LID_TOPK = 10
14
- LID_THRESHOLD = 0.33
15
-
16
  LID_LANGUAGES = {}
17
- with open(f"data/lid/all_langs.tsv") as f:
 
 
 
 
 
 
18
  for line in f:
19
- iso, name = line.split(" ", 1)
20
  LID_LANGUAGES[iso] = name
21
 
22
-
23
- def identify(audio_data = None):
24
  if not audio_data:
25
  return "<<ERROR: Empty Audio Input>>"
26
-
 
27
  if isinstance(audio_data, tuple):
28
- # microphone
29
  sr, audio_samples = audio_data
30
  audio_samples = (audio_samples / 32768.0).astype(np.float32)
31
  if sr != LID_SAMPLING_RATE:
32
- audio_samples = librosa.resample(
33
- audio_samples, orig_sr=sr, target_sr=LID_SAMPLING_RATE
34
- )
35
- else:
36
- # file upload
37
- isinstance(audio_data, str)
38
- audio_samples = librosa.load(audio_data, sr=LID_SAMPLING_RATE, mono=True)[0]
39
 
40
- inputs = processor(
41
- audio_samples, sampling_rate=LID_SAMPLING_RATE, return_tensors="pt"
42
- )
 
 
43
 
44
- # set device
45
- if torch.cuda.is_available():
46
- device = torch.device("cuda")
47
- elif (
48
- hasattr(torch.backends, "mps")
49
- and torch.backends.mps.is_available()
50
- and torch.backends.mps.is_built()
51
- ):
52
- device = torch.device("mps")
53
  else:
54
- device = torch.device("cpu")
 
 
 
55
 
 
 
56
  model.to(device)
57
  inputs = inputs.to(device)
58
 
 
59
  with torch.no_grad():
60
  logit = model(**inputs).logits
61
 
 
62
  logit_lsm = torch.log_softmax(logit.squeeze(), dim=-1)
63
  scores, indices = torch.topk(logit_lsm, 5, dim=-1)
64
- scores, indices = torch.exp(scores).to("cpu").tolist(), indices.to("cpu").tolist()
 
 
65
  iso2score = {model.config.id2label[int(i)]: s for s, i in zip(scores, indices)}
 
 
66
  if max(iso2score.values()) < LID_THRESHOLD:
67
- return "Low confidence in the language identification predictions. Output is not shown!"
68
- return {LID_LANGUAGES[iso]: score for iso, score in iso2score.items()}
69
 
 
70
 
 
71
  LID_EXAMPLES = [
72
  ["upload/english.mp3"],
73
  ["upload/tamil.mp3"],
74
  ["upload/burmese.mp3"],
75
  ]
76
- demo.launch()
77
- ).launch()
78
- demo.launch()
 
from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
import torch
import librosa
import numpy as np
import os

# Load Facebook MMS Language Identification Model (1024-language checkpoint).
MODEL_ID = "facebook/mms-lid-1024"
processor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_ID)

# Constants
LID_SAMPLING_RATE = 16_000  # MMS models expect 16 kHz mono audio
LID_THRESHOLD = 0.33  # Confidence threshold

# ISO code -> human-readable language name, loaded from the label file.
LID_LANGUAGES = {}

# Load Language Labels
LANG_FILE = "data/lid/all_langs.tsv"
if not os.path.exists(LANG_FILE):
    raise FileNotFoundError(f"Language file '{LANG_FILE}' not found!")

with open(LANG_FILE, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Fix: a blank line (e.g. the file's trailing newline) would
            # otherwise crash the split below with a ValueError.
            continue
        # NOTE(review): splits on the first space despite the .tsv extension;
        # assumes the label file is space-separated — confirm against the data.
        iso, name = line.split(" ", 1)
        LID_LANGUAGES[iso] = name
26
 
27
# Identify Audio Language
def identify(audio_data=None):
    """Identify the spoken language of an audio clip with MMS-LID.

    Parameters:
        audio_data: either a ``(sample_rate, samples)`` tuple as produced by a
            microphone widget (samples presumably int16 PCM — the /32768 scaling
            below depends on this; confirm against the UI component), or a path
            to an audio file on disk. ``None``/empty input is rejected.

    Returns:
        dict mapping language name -> probability for the top-5 predictions,
        or an error / low-confidence message string.
    """
    if not audio_data:
        return "<<ERROR: Empty Audio Input>>"

    # Microphone Input
    if isinstance(audio_data, tuple):
        sr, audio_samples = audio_data
        audio_samples = np.asarray(audio_samples)
        if audio_samples.ndim > 1:
            # Fix: stereo/multi-channel recordings arrive as a 2-D array;
            # down-mix to mono so this path matches the file-upload path
            # (which uses mono=True) and the feature extractor gets 1-D audio.
            audio_samples = audio_samples.mean(axis=-1)
        # Scale int16 PCM range into [-1.0, 1.0] floats.
        audio_samples = (audio_samples / 32768.0).astype(np.float32)
        if sr != LID_SAMPLING_RATE:
            audio_samples = librosa.resample(
                audio_samples, orig_sr=sr, target_sr=LID_SAMPLING_RATE
            )

    # File Upload
    elif isinstance(audio_data, str):
        if not os.path.exists(audio_data):
            return f"<<ERROR: File '{audio_data}' not found!>>"
        # librosa handles decoding, resampling and mono down-mixing.
        audio_samples, _ = librosa.load(audio_data, sr=LID_SAMPLING_RATE, mono=True)

    else:
        return "<<ERROR: Invalid Audio Input>>"

    # Process Input
    inputs = processor(audio_samples, sampling_rate=LID_SAMPLING_RATE, return_tensors="pt")

    # Select Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = inputs.to(device)

    # Predict Language
    with torch.no_grad():
        logit = model(**inputs).logits

    # Compute Probabilities (top-5, converted from log-softmax back to probs)
    logit_lsm = torch.log_softmax(logit.squeeze(), dim=-1)
    scores, indices = torch.topk(logit_lsm, 5, dim=-1)
    scores, indices = torch.exp(scores).cpu().tolist(), indices.cpu().tolist()

    # Map to Language Labels
    iso2score = {model.config.id2label[int(i)]: s for s, i in zip(scores, indices)}

    # Confidence Check
    if max(iso2score.values()) < LID_THRESHOLD:
        return "Low confidence in language detection. No output shown."

    # Fall back to the raw ISO code if the label file lacks an entry.
    return {LID_LANGUAGES.get(iso, iso): score for iso, score in iso2score.items()}
73
 
74
# Example audio clips offered as one-click demo inputs.
LID_EXAMPLES = [
    [f"upload/{clip}.mp3"] for clip in ("english", "tamil", "burmese")
]