import os
import wave
import nltk
import torch
import openai
import whisper
import textstat
import datetime
import requests
import subprocess
import contextlib
import numpy as np
import gradio as gr
from pyannote.audio import Audio
from pyannote.core import Segment
from sklearn.cluster import AgglomerativeClustering
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

# Speaker-embedding model used for diarization
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cpu"))

nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
model = whisper.load_model('base')
audio = Audio()
openai.api_key = os.environ['OPEN_AI_API_KEY']

example_files = [
    "https://pdf.bluetickconsultants.com/e-commerce-call.mp3",
    "https://pdf.bluetickconsultants.com/customer_support.mp3",
    "https://pdf.bluetickconsultants.com/product_refund.mp3",
]
file_names = []


def download_file(url, save_name):
    if not os.path.exists(save_name):
        file = requests.get(url)
        open(save_name, 'wb').write(file.content)


# Fetch the example recordings and register them as Gradio examples
# (each entry pairs a file path with the default speaker count of 2)
for url in example_files:
    save_name = str(url).split("/")[-1]
    download_file(url, save_name)
    file_names.append([save_name, 2])


def segment_embedding(segment, duration, audio_file):
    start = segment["start"]
    # Whisper overshoots the end timestamp in the last segment
    end = min(duration, segment["end"])
    clip = Segment(start, end)
    waveform, sample_rate = audio.crop(audio_file, clip)
    # Collapse multi-channel audio to mono before embedding
    waveform = waveform.mean(dim=0, keepdim=True)
    return embedding_model(waveform.unsqueeze(0))


def speech_to_text_and_sentiment(audio_file, number_of_speakers=2):
    # Convert the upload to WAV so the wave module and pyannote can read it
    if audio_file[-3:] != 'wav':
        audio_file_name = audio_file.split("/")[-1]
        audio_file_name = audio_file_name.split(".")[0] + ".wav"
        subprocess.call(['ffmpeg', '-i', audio_file, audio_file_name, '-y'])
        audio_file = audio_file_name

    # Transcribe with Whisper
    result = model.transcribe(audio_file)
    segments = result["segments"]

    # Total duration of the audio, needed to clamp the final segment
    with contextlib.closing(wave.open(audio_file, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)

    # One 192-dimensional ECAPA embedding per transcribed segment
    embeddings = np.zeros(shape=(len(segments), 192))
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(segment, duration, audio_file)
    embeddings = np.nan_to_num(embeddings)

    # Cluster the segment embeddings to assign a speaker label to each segment
    clustering = AgglomerativeClustering(
        int(number_of_speakers)).fit(embeddings)
    labels = clustering.labels_
    for i in range(len(segments)):
        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

    def time(secs):
        return datetime.timedelta(seconds=round(secs))

    # Build a speaker-labelled transcript
    conv = ""
    for (i, segment) in enumerate(segments):
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            conv += "\n" + segment["speaker"] + ' ' + \
                str(time(segment["start"])) + '\n'
        conv += segment["text"][1:] + ' '

    sentiment_scores = sid.polarity_scores(conv)

    messages = [
        {
            "role": "system",
            "content": """You will be provided with a conversation. Your task is to give a summary and mention all the main details in bullet points. Replace speaker 1 and speaker 2 with the sales executive or company name and the customer name if available."""
        },
        {
            "role": "user",
            "content": conv
        }
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    readability_score = textstat.flesch_reading_ease(conv)

    call_summary = ""
    call_summary += "Sentiment Analysis:\n" + "-------------------------------------\n"
    call_summary += f"Positive: {sentiment_scores['pos']} | Negative: {sentiment_scores['neg']} | Neutral: {sentiment_scores['neu']}\n\n"
    call_summary += "Readability / Clarity of speech:\n" + "-------------------------------------\n"
    call_summary += f"Readability Score (Flesch-Kincaid): {readability_score}\n\n"
    call_summary += "Call Summary:\n" + "-------------------------------------\n"
    call_summary += response["choices"][0]["message"]["content"]

    return call_summary, conv


demo = gr.Interface(
    title="Bluetick Sales Call Evaluator",
    description="Upload a sales call audio file and get a transcription of the call along with sentiment analysis",
    fn=speech_to_text_and_sentiment,
    inputs=[
        gr.Audio(label="Select audio file", type="filepath"),
        gr.Number(label="Select number of speakers (1-5)", value=2)
    ],
    outputs=[
        gr.Textbox(label="Analysis & Summary"),
        gr.Textbox(label="Transcript"),
    ],
    examples=file_names,
    theme=gr.themes.Soft().set(
        body_text_color="black"
    ),
    css=".gradio-container {background-color: white !important;} .prose h1{color: black !important;} p {color: black !important;}",
)

demo.launch(debug=True)