import os
import wave
import nltk
import torch
import openai
import whisper
import textstat
import datetime
import requests
import subprocess
import contextlib
import numpy as np
import gradio as gr
from pyannote.audio import Audio
from pyannote.core import Segment
from sklearn.cluster import AgglomerativeClustering
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
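# Pretrained ECAPA-TDNN speaker-embedding model (SpeechBrain, trained on VoxCeleb); CPU inference is enough here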
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cpu"))
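# One-time setup: VADER lexicon and sentiment analyzer, plus the Whisper "base" speech-to-text model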
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
model = whisper.load_model('base')
audio = Audio()
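# The OpenAI key is read from the OPEN_AI_API_KEY environment variable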
openai.api_key = os.environ['OPEN_AI_API_KEY']
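# Example call recordings downloaded at startup and listed as Gradio examples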
example_files = [
"https://pdf.bluetickconsultants.com/e-commerce-call.mp3",
"https://pdf.bluetickconsultants.com/customer_support.mp3",
"https://pdf.bluetickconsultants.com/product_refund.mp3",
]
file_names = []
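# Download a remote file once and cache it next to the app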
def download_file(url, save_name):
    if not os.path.exists(save_name):
        response = requests.get(url)
        with open(save_name, 'wb') as f:
            f.write(response.content)
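# Fetch the example recordings and register each one (with a default of 2 speakers) as a Gradio example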
for url in example_files:
    save_name = str(url).split("/")[-1]
    download_file(url, save_name)
    file_names.append([save_name, 2])
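# Compute a speaker embedding for a single Whisper segment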
def segment_embedding(segment, duration, audio_file):
    start = segment["start"]
    # Whisper overshoots the end timestamp in the last segment
    end = min(duration, segment["end"])
    clip = Segment(start, end)
    waveform, sample_rate = audio.crop(audio_file, clip)
    # Average the channels to mono before embedding
    waveform = waveform.mean(dim=0, keepdim=True)
    return embedding_model(waveform.unsqueeze(0))
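# Main pipeline: transcribe, diarize, score sentiment and readability, then summarise with GPT-3.5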
def speech_to_text_and_sentiment(audio_file, number_of_speakers=2):
    # Convert non-WAV uploads to WAV so the wave module and pyannote can read them
    if not audio_file.endswith('.wav'):
        audio_file_name = audio_file.split("/")[-1]
        audio_file_name = audio_file_name.split(".")[0] + ".wav"
        subprocess.call(['ffmpeg', '-y', '-i', audio_file, audio_file_name])
        audio_file = audio_file_name
    # Transcribe with Whisper and read the clip duration from the WAV header
    result = model.transcribe(audio_file)
    segments = result["segments"]
    with contextlib.closing(wave.open(audio_file, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)
    # Embed every segment, then cluster the embeddings into the requested number of speakers
    embeddings = np.zeros(shape=(len(segments), 192))
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(segment, duration, audio_file)
    embeddings = np.nan_to_num(embeddings)
    clustering = AgglomerativeClustering(
        int(number_of_speakers)).fit(embeddings)
    labels = clustering.labels_
    for i in range(len(segments)):
        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
    def time(secs):
        return datetime.timedelta(seconds=round(secs))

    # Build a speaker-labelled transcript, adding a header whenever the speaker changes
    conv = ""
    for (i, segment) in enumerate(segments):
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            conv += "\n" + segment["speaker"] + ' ' + \
                str(time(segment["start"])) + '\n'
        conv += segment["text"][1:] + ' '
    # VADER sentiment over the full conversation, then ask GPT-3.5 for a bullet-point summary
    sentiment_scores = sid.polarity_scores(conv)
    messages = [
        {
            "role": "system",
            "content": """You will be provided with a conversation. Your task is to give a summary and mention all the main details in bullet points.
            Replace speaker 1 and speaker 2 with the sales executive or company name and the customer name if available.
            """
        },
        {
            "role": "user",
            "content": conv
        }
    ]
    # Legacy (openai<1.0) ChatCompletion API
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    # Flesch Reading Ease as a rough measure of how clearly the call reads
    readability_score = textstat.flesch_reading_ease(conv)
    call_summary = ""
    call_summary += "Sentiment Analysis:\n" + "-------------------------------------\n"
    call_summary += f"Positive: {sentiment_scores['pos']} | Negative: {sentiment_scores['neg']} | Neutral: {sentiment_scores['neu']}\n\n"
    call_summary += "Readability / Clarity of speech:\n" + "-------------------------------------\n"
    call_summary += f"Readability Score (Flesch Reading Ease): {readability_score}\n\n"
    call_summary += "Call Summary:\n" + "-------------------------------------\n"
    call_summary += response["choices"][0]["message"]["content"]
    return call_summary, conv
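# Gradio UI: audio file and speaker count in, analysis summary and transcript out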
demo = gr.Interface(
    title="Bluetick Sales Call Evaluator",
    description="Upload a sales call recording to get a speaker-labelled transcript along with sentiment analysis, a readability score, and an AI-generated summary",
    fn=speech_to_text_and_sentiment,
    inputs=[
        gr.Audio(label="Select audio file", type="filepath"),
        gr.Number(label="Select number of speakers (1-5)", value=2, precision=0)
    ],
    outputs=[
        gr.Textbox(label="Analysis & Summary"),
        gr.Textbox(label="Transcript"),
    ],
    examples=file_names,
    theme=gr.themes.Soft().set(
        body_text_color="black"
    ),
    css=".gradio-container {background-color: white !important;} .prose h1{color: black !important;} p {color: black !important;}",
)
demo.launch(debug=True)