Spaces:
Runtime error
Runtime error
ktangri
committed on
Commit
•
c150302
1
Parent(s):
eb6ba59
Adding speaker segmentation
Browse files- app.py +24 -5
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline, Wav2Vec2ProcessorWithLM
|
|
|
3 |
from librosa import load, resample
|
4 |
from rpunct import RestorePuncts
|
5 |
|
6 |
asr_model = 'patrickvonplaten/wav2vec2-base-100h-with-lm'
|
7 |
processor = Wav2Vec2ProcessorWithLM.from_pretrained(asr_model)
|
8 |
asr = pipeline('automatic-speech-recognition', model=asr_model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, decoder=processor.decoder)
|
|
|
9 |
|
10 |
rpunct = RestorePuncts()
|
11 |
|
@@ -13,19 +15,36 @@ def transcribe(filepath):
|
|
13 |
speech, sampling_rate = load(filepath)
|
14 |
if sampling_rate != 16000:
|
15 |
speech = resample(speech, sampling_rate, 16000)
|
16 |
-
|
17 |
-
text =
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
mic = gr.inputs.Audio(source='microphone', type='filepath', label='Speech input', optional=False)
|
21 |
|
22 |
-
|
|
|
23 |
|
24 |
iface = gr.Interface(
|
25 |
theme='huggingface',
|
26 |
description='Testing transcription',
|
27 |
fn=transcribe,
|
28 |
inputs=[mic],
|
29 |
-
outputs=[
|
30 |
)
|
31 |
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline, Wav2Vec2ProcessorWithLM
|
3 |
+
from pyannote.audio import Pipeline
|
4 |
from librosa import load, resample
|
5 |
from rpunct import RestorePuncts
|
6 |
|
7 |
# --- Model initialisation -------------------------------------------------
# Speech-to-text: Wav2Vec2 CTC model decoded with an n-gram language model.
asr_model = 'patrickvonplaten/wav2vec2-base-100h-with-lm'
processor = Wav2Vec2ProcessorWithLM.from_pretrained(asr_model)
asr = pipeline(
    'automatic-speech-recognition',
    model=asr_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,
)

# Pretrained speaker-segmentation pipeline, used to attribute words to speakers.
speaker_segmentation = Pipeline.from_pretrained("pyannote/speaker-segmentation")

# Punctuation/casing restoration applied to each diarized segment.
rpunct = RestorePuncts()
|
13 |
|
|
|
15 |
def transcribe(filepath):
    """Transcribe an audio file and attribute the transcript to speakers.

    Parameters
    ----------
    filepath : str
        Path to the recorded audio file (Gradio microphone input).

    Returns
    -------
    tuple[str, str]
        (diarized transcript — one punctuated line per speaker turn with
        start/end timestamps — and the full lower-cased transcript).
    """
    speech, sampling_rate = load(filepath)
    if sampling_rate != 16000:
        # The Wav2Vec2 model expects 16 kHz audio.
        speech = resample(speech, sampling_rate, 16000)

    # NOTE(review): pyannote pipelines typically expect a file path or a
    # {"waveform", "sample_rate"} dict, not a bare numpy array — confirm this
    # call against the installed pyannote-audio (develop) API.
    speaker_output = speaker_segmentation(speech)
    text = asr(speech, return_timestamps="word")

    full_text = text['text'].lower()
    chunks = text['chunks']

    # BUG FIX: the accumulator was initialised as `diarizaed_output` but
    # appended to as `diarized_output` and returned as `diarizaed_output`,
    # raising NameError at runtime. Use one consistent name.
    diarized_output = ""
    i = 0
    for turn, _, speaker in speaker_output.itertracks(yield_label=True):
        # Gather every word whose end timestamp falls within this turn.
        diarized = ""
        while i < len(chunks) and chunks[i]['timestamp'][1] <= turn.end:
            diarized += chunks[i]['text'].lower() + ' '
            i += 1

        if diarized != "":
            # Restore punctuation/casing on the raw lower-cased segment;
            # turns with no words are skipped entirely.
            diarized = rpunct.punctuate(diarized)
            diarized_output += "{}: ''{}'' from {:.3f}-{:.3f}\n".format(speaker, diarized, turn.start, turn.end)

    return diarized_output, full_text
|
37 |
|
38 |
# --- Gradio UI ------------------------------------------------------------
# The microphone recording is written to disk and passed to `transcribe`
# as a file path.
mic = gr.inputs.Audio(source='microphone', type='filepath', label='Speech input', optional=False)

# Two text outputs: speaker-attributed transcript and the plain transcript.
diarized_transcript = gr.outputs.Textbox(type='auto', label='Diarized Output')
full_transcript = gr.outputs.Textbox(type='auto', label='Full Transcript')

iface = gr.Interface(
    fn=transcribe,
    inputs=[mic],
    outputs=[diarized_transcript, full_transcript],
    theme='huggingface',
    description='Testing transcription',
)
iface.launch()
|
requirements.txt
CHANGED
@@ -4,3 +4,4 @@ librosa
|
|
4 |
pyctcdecode
|
5 |
pypi-kenlm
|
6 |
git+https://github.com/anuragshas/rpunct.git
|
|
|
|
4 |
pyctcdecode
|
5 |
pypi-kenlm
|
6 |
git+https://github.com/anuragshas/rpunct.git
|
7 |
+
https://github.com/pyannote/pyannote-audio/archive/develop.zip
|