nithinraok committed on
Commit
d68b1ee
·
1 Parent(s): a0314cc

Add ASR Text

Browse files
Files changed (1) hide show
  1. app.py +82 -9
app.py CHANGED
@@ -1,23 +1,96 @@
1
- import nemo
2
  from nemo.collections.asr.models.msdd_models import NeuralDiarizer
 
 
3
  import gradio as gr
4
  import pandas as pd
5
  import torch
 
 
 
6
 
7
  device = "cuda" if torch.cuda.is_available() else "cpu"
8
 
9
  model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)
 
 
10
 
11
def run_diarization(path1):
    """Diarize the audio at ``path1`` and return one table row per RTTM line.

    The module-level ``model`` is called on the path and its result is
    serialised with ``to_rttm()``. Each non-empty RTTM line is split on
    whitespace; field 3 is read as the segment start, field 4 as its
    duration and field 7 as the speaker label.

    Args:
        path1: Path of the audio file handed to the diarization model.

    Returns:
        A ``pandas.DataFrame`` with columns ``start_time``, ``end_time`` and
        ``speaker``. Both time columns hold floats (``end_time`` is
        ``start_time + duration``).
    """
    annotation = model(path1, num_workers=0, batch_size=16)
    rttm = annotation.to_rttm()

    rows = []
    for line in rttm.splitlines():
        fields = line.split()
        if not fields:
            # Tolerate stray blank lines instead of failing on fields[3].
            continue
        # Convert both numeric fields up front so the two time columns share
        # one type (the start used to be kept as the raw string token).
        start_time, duration, speaker = float(fields[3]), float(fields[4]), fields[7]
        rows.append((start_time, start_time + duration, speaker))

    # Build the frame in one go rather than growing it row by row with .loc.
    return pd.DataFrame(rows, columns=['start_time', 'end_time', 'speaker'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  article = (
23
  "<p style='text-align: center'>"
@@ -38,7 +111,7 @@ microphone_interface = gr.Interface(
38
  title="Offline Speaker Diarization with NeMo",
39
  description="This demonstration will perform offline speaker diarization on an audio file using nemo",
40
  article=article,
41
- layout="horizontal",
42
  theme="huggingface",
43
  allow_flagging=False,
44
  live=False,
@@ -52,7 +125,7 @@ upload_interface = gr.Interface(
52
  title="Offline Speaker Diarization with NeMo",
53
  description="This demonstration will perform offline speaker diarization on an audio file using nemo",
54
  article=article,
55
- layout="horizontal",
56
  theme="huggingface",
57
  allow_flagging=False,
58
  live=False,
@@ -61,4 +134,4 @@ upload_interface = gr.Interface(
61
 
62
  demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])
63
 
64
- demo.launch(enable_queue=True)
 
 
1
  from nemo.collections.asr.models.msdd_models import NeuralDiarizer
2
+ from nemo.collections.asr.models import EncDecRNNTBPEModel
3
+ from nemo.collections.asr.models import EncDecSpeakerLabelModel
4
  import gradio as gr
5
  import pandas as pd
6
  import torch
7
+ import json
8
+ from omegaconf import OmegaConf
9
+ import uuid
10
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
 
13
  model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)
14
+ speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large").to(device)
15
+ model.eval()
16
 
17
def run_diarization(path1):
    """Diarize the audio at ``path1`` and attach a transcript to each speaker turn.

    The module-level ``model`` is called on the path and its result is
    serialised with ``to_rttm()``. Each non-empty RTTM line is split on
    whitespace; field 3 is read as the segment start, field 4 as its
    duration and field 7 as the speaker label. Consecutive segments that
    carry the same speaker label are merged into a single turn whose end is
    the end of the last merged segment. The merged turns are then passed to
    ``get_transcripts`` and the returned texts fill the ``text`` column.

    Args:
        path1: Path of the audio file handed to the diarization model.

    Returns:
        A ``pandas.DataFrame`` with columns ``start_time``, ``end_time``,
        ``speaker`` and ``text``. When the RTTM output has no segments, a
        single placeholder row ``(0, 0, 'No speaker found', '')`` is returned.

    Raises:
        RuntimeError: If the number of transcripts returned differs from the
            number of speaker turns.
    """
    print(path1)
    annotation = model(path1, num_workers=0, batch_size=16)
    rttm = annotation.to_rttm()
    columns = ['start_time', 'end_time', 'speaker', 'text']

    # Each turn is [start_time, end_time, speaker, text]; lists so that the
    # end time can be extended in place while merging.
    turns = []
    for line in rttm.splitlines():
        fields = line.split()
        if not fields:
            # Tolerate stray blank lines instead of failing on fields[3].
            continue
        start_time, duration, speaker = float(fields[3]), float(fields[4]), fields[7]
        end_time = start_time + duration
        if turns and turns[-1][2] == speaker:
            # Same speaker keeps talking: stretch the current turn.
            turns[-1][1] = end_time
        else:
            turns.append([start_time, end_time, speaker, ''])

    if not turns:
        # The placeholder row must supply a value for every column; the
        # previous three-value assignment raised on this four-column frame.
        return pd.DataFrame([[0, 0, 'No speaker found', '']], columns=columns)

    df = pd.DataFrame(turns, columns=columns)

    hyp = get_transcripts(df, path1)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(hyp) != len(df):
        raise RuntimeError(
            f"Expected {len(df)} transcripts (one per speaker turn), got {len(hyp)}"
        )

    df['text'] = list(hyp)
    return df
48
+
49
def create_manifest(df,audio_path):
    """Write a JSON-lines manifest with one entry per row of ``df``.

    Every row produces one line holding a JSON object with the keys
    ``audio_filepath`` (always ``audio_path``), ``duration``
    (``end_time - start_time``), ``label`` (the row's ``speaker``) and
    ``offset`` (the row's ``start_time``).

    Args:
        df: Table with ``start_time``, ``end_time`` and ``speaker`` columns.
            The two time columns may hold floats, integers or numeric strings.
        audio_path: Path of the audio file that all rows refer to.

    Returns:
        Path of the newly written manifest: a uniquely named ``.json`` file in
        the system temporary directory. The caller owns the file and is
        responsible for deleting it.
    """
    # Imported locally so this function does not depend on the file's import
    # block being changed.
    import os
    import tempfile

    # Use the platform's temp directory rather than a hard-coded '/tmp/'.
    filename = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()) + '.json')
    with open(filename, 'w', encoding='utf-8') as f:
        for start_time, end_time, speaker in zip(df['start_time'], df['end_time'], df['speaker']):
            # Coerce to built-in floats: numpy integers are not JSON
            # serialisable and string timestamps cannot be subtracted.
            start_time, end_time = float(start_time), float(end_time)
            entry = {
                "audio_filepath": audio_path,
                "duration": end_time - start_time,
                "label": speaker,
                "offset": start_time,
            }
            f.write(json.dumps(entry) + '\n')

    return filename
62
+
63
# Lazily created ASR model, shared by every call to get_transcripts.
_ASR_MODEL = None


def _load_asr_model():
    """Return the shared ASR model, loading it from the pretrained checkpoint on first use."""
    global _ASR_MODEL
    if _ASR_MODEL is None:
        asr_model = EncDecRNNTBPEModel.from_pretrained(
            model_name="nvidia/stt_en_fastconformer_transducer_large"
        ).to(device)
        asr_model.eval()
        _ASR_MODEL = asr_model
    return _ASR_MODEL


def get_transcripts(df, audio_path):
    """Transcribe every segment listed in ``df`` from the audio at ``audio_path``.

    A temporary manifest is written with ``create_manifest``, fed to the ASR
    model's transcription dataloader in batches of 4, and each batch is
    encoded and decoded. The manifest file is deleted before returning.

    Args:
        df: Table of segments; it is passed unchanged to ``create_manifest``.
        audio_path: Path of the audio file the segments refer to.

    Returns:
        A list with the best hypothesis for each manifest entry, in the order
        the dataloader yields them. NOTE(review): the caller expects this to
        line up one-to-one with the rows of ``df`` - confirm the dataloader
        neither shuffles nor drops entries.
    """
    # Imported locally so this function does not depend on the file's import
    # block being changed.
    import os

    filename = create_manifest(df, audio_path)
    # Reuse one model instance instead of reloading the checkpoint per request.
    # A distinct local name also avoids shadowing the module-level diarizer
    # `model`.
    asr_model = _load_asr_model()
    config = OmegaConf.create({"manifest_filepath": filename, 'batch_size': 4})

    hypotheses = []
    try:
        dataloader = asr_model._setup_transcribe_dataloader(config)
        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            for test_batch in dataloader:
                encoded, encoded_len = asr_model.forward(
                    input_signal=test_batch[0].to(device),
                    input_signal_length=test_batch[1].to(device),
                )
                best_hyp, _ = asr_model.decoding.rnnt_decoder_predictions_tensor(
                    encoded,
                    encoded_len,
                    return_hypotheses=False,
                    partial_hypotheses=None,
                )
                hypotheses += best_hyp

                # Drop references to the batch tensors before the next batch
                # is allocated.
                del encoded
                del test_batch
    finally:
        # The manifest is only an intermediate artefact; do not let one
        # accumulate in the temp directory for every request.
        try:
            os.remove(filename)
        except OSError:
            pass

    return hypotheses
94
 
95
  article = (
96
  "<p style='text-align: center'>"
 
111
  title="Offline Speaker Diarization with NeMo",
112
  description="This demonstration will perform offline speaker diarization on an audio file using nemo",
113
  article=article,
114
+ layout="vertical",
115
  theme="huggingface",
116
  allow_flagging=False,
117
  live=False,
 
125
  title="Offline Speaker Diarization with NeMo",
126
  description="This demonstration will perform offline speaker diarization on an audio file using nemo",
127
  article=article,
128
+ layout="vertical",
129
  theme="huggingface",
130
  allow_flagging=False,
131
  live=False,
 
134
 
135
  demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])
136
 
137
+ demo.launch(enable_queue=True)