# import whisper
import gradio as gr
import datetime
import subprocess
import wave
import contextlib
# import torch
# import pyannote.audio
# from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# from pyannote.audio import Audio
# from pyannote.core import Segment
# from sklearn.cluster import AgglomerativeClustering
# import numpy as np

# model = whisper.load_model("large-v2")
# embedding_model = PretrainedSpeakerEmbedding(
#     "speechbrain/spkrec-ecapa-voxceleb",
#     device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# )
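# The commented-out pipeline below transcribes an uploaded file with Whisper,
# computes a speaker embedding for every transcript segment, clusters the
# embeddings into the requested number of speakers, and formats the result as
# a speaker-labelled transcript.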
# def transcribe(audio, num_speakers):
#     path, error = convert_to_wav(audio)
#     if error is not None:
#         return error
#     duration = get_duration(path)
#     if duration > 4 * 60 * 60:
#         return "Audio duration too long"
#     result = model.transcribe(path)
#     segments = result["segments"]
#     num_speakers = min(max(round(num_speakers), 1), len(segments))
#     if len(segments) == 1:
#         segments[0]['speaker'] = 'SPEAKER 1'
#     else:
#         embeddings = make_embeddings(path, segments, duration)
#         add_speaker_labels(segments, embeddings, num_speakers)
#     output = get_output(segments)
#     return output

# def convert_to_wav(path):
#     if path[-3:] != 'wav':
#         new_path = '.'.join(path.split('.')[:-1]) + '.wav'
#         try:
#             subprocess.call(['ffmpeg', '-y', '-i', path, new_path])
#         except Exception:
#             return path, 'Error: Could not convert file to .wav'
#         path = new_path
#     return path, None

# def get_duration(path):
#     with contextlib.closing(wave.open(path, 'r')) as f:
#         frames = f.getnframes()
#         rate = f.getframerate()
#         return frames / float(rate)
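# Build one 192-dimensional speaker embedding (speechbrain ECAPA) per Whisper
# segment; NaNs from degenerate clips are zeroed out.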
# def make_embeddings(path, segments, duration):
#     embeddings = np.zeros(shape=(len(segments), 192))
#     for i, segment in enumerate(segments):
#         embeddings[i] = segment_embedding(path, segment, duration)
#     return np.nan_to_num(embeddings)

# audio = Audio()

# def segment_embedding(path, segment, duration):
#     start = segment["start"]
#     # Whisper overshoots the end timestamp in the last segment
#     end = min(duration, segment["end"])
#     clip = Segment(start, end)
#     waveform, sample_rate = audio.crop(path, clip)
#     return embedding_model(waveform[None])
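# Agglomerative clustering over the segment embeddings assigns a 'SPEAKER n'
# label to every segment.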
# def add_speaker_labels(segments, embeddings, num_speakers):
#     clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
#     labels = clustering.labels_
#     for i in range(len(segments)):
#         segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

# def time(secs):
#     return datetime.timedelta(seconds=round(secs))

# def get_output(segments):
#     output = ''
#     for (i, segment) in enumerate(segments):
#         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
#             if i != 0:
#                 output += '\n\n'
#             output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
#         output += segment["text"][1:] + ' '
#     return output
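# Placeholder callbacks for the UI below; greet1 also mutates the module-level
# string `s`, which greet2 reads back.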
s = "" | |
def greet1(name): | |
global s | |
s = "modified" | |
return "Hello " + name + "!" | |
def greet2(name): | |
return "Hi " + name + "!" + " " + s | |
def greet3(name): | |
return "Hola " + name + "!" | |
with gr.Blocks() as demo:
    with gr.Box():
        with gr.Row():
            with gr.Column():
                audio_file = gr.File(label="Upload an audio file (.wav)", file_count="single")
                # name = gr.Textbox(label="Name", placeholder="Name")  # TODO: remove
                number_of_speakers = gr.Number(label="Number of Speakers", value=2)
                with gr.Row():
                    btn_clear = gr.Button(value="Clear")
                    btn_submit = gr.Button(value="Submit")
            with gr.Column():
                title = gr.Textbox(label="Title", placeholder="Title for Conversation")
                short_summary = gr.Textbox(label="Short Summary", placeholder="Short Summary for Conversation")
                sentiment_analysis = gr.Textbox(label="Sentiment Analysis", placeholder="Sentiment Analysis for Conversation")
                quality = gr.Textbox(label="Quality of Conversation", placeholder="Quality of Conversation")
                detailed_summary = gr.Textbox(label="Detailed Summary", placeholder="Detailed Summary for Conversation")
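    # NOTE: btn_clear and btn_submit are not wired to callbacks yet. A minimal
    # sketch, assuming a hypothetical `process(audio, num_speakers)` function
    # that returns a value for every output field, could look like:
    #
    #     btn_submit.click(
    #         fn=process,
    #         inputs=[audio_file, number_of_speakers],
    #         outputs=[title, short_summary, sentiment_analysis, quality, detailed_summary],
    #     )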
gr.Markdown("## Examples") | |
gr.Examples( | |
examples=[ | |
[ | |
"Harsh", | |
2, | |
], | |
[ | |
"Rahul", | |
2, | |
], | |
], | |
inputs=[title], | |
outputs=[short_summary], | |
fn=greet1, | |
cache_examples=True, | |
) | |
    gr.Markdown(
        """
        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
        for more details.
        """
    )

demo.launch()