# import whisper
import gradio as gr
import datetime
import subprocess
import wave
import contextlib
# import torch
# import pyannote.audio
# from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# from pyannote.audio import Audio
# from pyannote.core import Segment
# from sklearn.cluster import AgglomerativeClustering
# import numpy as np

# model = whisper.load_model("large-v2")
# embedding_model = PretrainedSpeakerEmbedding(
#     "speechbrain/spkrec-ecapa-voxceleb",
#     device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# )

# def transcribe(audio, num_speakers):
#     path, error = convert_to_wav(audio)
#     if error is not None:
#         return error
#     duration = get_duration(path)
#     if duration > 4 * 60 * 60:
#         return "Audio duration too long"
#     result = model.transcribe(path)
#     segments = result["segments"]
#     num_speakers = min(max(round(num_speakers), 1), len(segments))
#     if len(segments) == 1:
#         segments[0]['speaker'] = 'SPEAKER 1'
#     else:
#         embeddings = make_embeddings(path, segments, duration)
#         add_speaker_labels(segments, embeddings, num_speakers)
#     output = get_output(segments)
#     return output

# def convert_to_wav(path):
#     if path[-3:] != 'wav':
#         new_path = '.'.join(path.split('.')[:-1]) + '.wav'
#         try:
#             subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
#         except:
#             return path, 'Error: Could not convert file to .wav'
#         path = new_path
#     return path, None

# def get_duration(path):
#     with contextlib.closing(wave.open(path, 'r')) as f:
#         frames = f.getnframes()
#         rate = f.getframerate()
#     return frames / float(rate)

# def make_embeddings(path, segments, duration):
#     embeddings = np.zeros(shape=(len(segments), 192))
#     for i, segment in enumerate(segments):
#         embeddings[i] = segment_embedding(path, segment, duration)
#     return np.nan_to_num(embeddings)

# audio = Audio()

# def segment_embedding(path, segment, duration):
#     start = segment["start"]
#     # Whisper overshoots the end timestamp in the last segment
#     end = min(duration, segment["end"])
#     clip = Segment(start, end)
#     waveform, sample_rate = audio.crop(path, clip)
#     return embedding_model(waveform[None])

# def add_speaker_labels(segments, embeddings, num_speakers):
#     clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
#     labels = clustering.labels_
#     for i in range(len(segments)):
#         segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

# def time(secs):
#     return datetime.timedelta(seconds=round(secs))

# def get_output(segments):
#     output = ''
#     for i, segment in enumerate(segments):
#         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
#             if i != 0:
#                 output += '\n\n'
#             output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
#         output += segment["text"][1:] + ' '
#     return output


# Module-level string shared by the placeholder handlers below;
# greet1 mutates it so greet2 can observe the change.
s = ""


def greet1(name):
    global s
    s = "modified"
    return "Hello " + name + "!"


def greet2(name):
    return "Hi " + name + "!" + " " + s


def greet3(name):
    return "Hola " + name + "!"
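
# --- Hedged usage sketch (not part of the original file) ---
# Assuming the commented-out Whisper/pyannote pipeline above is re-enabled
# (uncomment the imports, `model`, `embedding_model`, and the helper functions),
# the diarized transcription could be exercised directly; "sample.wav" is a
# hypothetical local file used only for illustration.
#
# if __name__ == "__main__":
#     print(transcribe("sample.wav", num_speakers=2))
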
with gr.Blocks() as demo:
    with gr.Box():
        with gr.Row():
            with gr.Column():
                audio_file = gr.File(label="Upload an audio file (.wav)", file_count="single")
                # name = gr.Textbox(label="Name", placeholder="Name")  # TODO: remove
                number_of_speakers = gr.Number(label="Number of Speakers", value=2)
                with gr.Row():
                    btn_clear = gr.Button(value="Clear")
                    btn_submit = gr.Button(value="Submit")
            with gr.Column():
                title = gr.Textbox(label="Title", placeholder="Title for Conversation")
                short_summary = gr.Textbox(label="Short Summary", placeholder="Short Summary for Conversation")
                sentiment_analysis = gr.Textbox(label="Sentiment Analysis", placeholder="Sentiment Analysis for Conversation")
                quality = gr.Textbox(label="Quality of Conversation", placeholder="Quality of Conversation")
                detailed_summary = gr.Textbox(label="Detailed Summary", placeholder="Detailed Summary for Conversation")

    gr.Markdown("## Examples")
    gr.Examples(
        # Each example supplies only the `title` input, matching greet1(name).
        examples=[
            ["Harsh"],
            ["Rahul"],
        ],
        inputs=[title],
        outputs=[short_summary],
        fn=greet1,
        cache_examples=True,
    )
    gr.Markdown(
        """
        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
        for more details.
        """
    )

demo.launch()
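
# --- Hedged wiring sketch (not part of the original file) ---
# The Clear/Submit buttons above are declared but never connected to handlers.
# Assuming greet1 (or, later, the re-enabled transcribe pipeline) is the intended
# callback, the event listeners would be attached inside the `with gr.Blocks()`
# block, for example:
#
#     btn_submit.click(fn=greet1, inputs=[title], outputs=[short_summary])
#     btn_clear.click(fn=lambda: "", inputs=None, outputs=[title])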