Spaces:

sergiomar73
/

qc-nlp-002-transcription-classifier-with-gpt3

Sleeping

qc-nlp-002-transcription-classifier-with-gpt3

File size: 8,037 Bytes

# https://huggingface.co/tasks/token-classification
# https://huggingface.co/spacy/en_core_web_sm
# pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl

import gradio as gr
import os
import time
import openai
import numpy as np
import pandas as pd
import pickle5 as pickle
import spacy
import en_core_web_sm
import plotly.express as px

# OpenAI credentials are read from the environment; os.environ.get returns
# None when a variable is unset, so a missing ORGANIZATION / API_KEY is not
# reported here.
openai.organization = os.environ.get('ORGANIZATION')
openai.api_key = os.environ.get('API_KEY')

# Reference phrases, loaded once at import time. Code further down in this
# file reads the columns "embedding", "example", "category" and "label".
# NOTE(review): read_pickle unpickles arbitrary Python objects, so this file
# must come from a trusted source.
df_phrases_path = './df_phrases.pkl'
df_phrases = pd.read_pickle(df_phrases_path)

# spaCy pipeline shared by transcript_to_sentences() for sentence splitting.
nlp = spacy.load("en_core_web_sm")

def transcript_to_sentences(transcript):
  """Split `transcript` into sentences with the module-level spaCy pipeline.

  Args:
    transcript: Text to segment.

  Returns:
    A list holding the text of every sentence span, in document order.
  """
  parsed = nlp(transcript)
  return [span.text for span in parsed.sents]
 
def calculate_embeddings_with_gpt3(text, engine="text-similarity-davinci-001", interval = 1.5, verbose=True):
  """Request the embedding of `text` from the OpenAI Embedding endpoint.

  Args:
    text: Value sent as the `input` of the request.
    engine: Name of the embedding engine to query.
    interval: Seconds to sleep before the request is issued.
    verbose: When truthy, print the text about to be embedded.

  Returns:
    The `embedding` entry of the first item in the response's `data` list.
  """
  if verbose:
    print(f'Calculating embedding for {text}...')
  # Pause before every call (presumably to stay under the API rate limit).
  time.sleep(interval)
  api_result = openai.Embedding.create(input=text, engine=engine)
  first_item = api_result['data'][0]
  return first_item['embedding']
 
def quantified_classification(transcript, threshold):
  """Score every sentence of a transcript against the reference phrases.

  Each sentence returned by `transcript_to_sentences` is embedded with
  `calculate_embeddings_with_gpt3` and compared, by cosine similarity, with
  every embedding stored in the module-level `df_phrases` frame (columns
  "embedding", "example", "category" and "label" are read).

  Args:
    transcript: Text to analyse.
    threshold: Similarity cut-off expressed as a percentage; it is divided
      by 100 before being compared with the cosine similarities.

  Returns:
    A tuple `(res, fig, details)` where
      res: dict mapping each tag to its highest similarity over all
        sentence/phrase pairs.
      fig: Plotly bar chart of those maxima, coloured by whether they exceed
        the threshold, with a dotted horizontal line at the threshold.
      details: DataFrame with columns sentence, phrase, category, tag and
        similarity holding the three most similar pairs for every tag.
  """
  sentences = transcript_to_sentences(transcript)

  # Reference data is read positionally, so the lookups no longer depend on
  # df_phrases having a default 0..n-1 index or exactly 64 rows.
  targets = np.array([np.array(vector) for vector in df_phrases["embedding"]])
  target_norms = np.linalg.norm(targets, axis=1)  # invariant across sentences
  examples = df_phrases["example"].tolist()
  categories = df_phrases["category"].tolist()
  labels = df_phrases["label"].tolist()

  # The caller supplies a percentage (0-100); similarities are fractions.
  threshold = threshold / 100

  # Collect plain dicts and build the frame once: DataFrame.append was removed
  # in pandas 2.0 and re-copied the whole frame on every call.
  records = []
  for line, sentence in enumerate(sentences, 1):
    source = np.array(calculate_embeddings_with_gpt3(sentence))
    cosines = np.dot(targets, source) / (target_norms * np.linalg.norm(source))
    records.extend(
      {
        'line': line,
        'sentence': sentence,
        'phrase': examples[position],
        'category': categories[position],
        'tag': labels[position],
        'similarity': float(similarity)
      }
      for position, similarity in enumerate(cosines.flatten())
    )

  df_results = pd.DataFrame(
    records,
    columns=['line', 'sentence', 'phrase', 'category', 'tag', 'similarity']
  )

  # Best similarity reached by each tag, flagged against the threshold.
  df_summary = df_results.groupby(['tag'])['similarity'].agg('max').to_frame()
  df_summary['ok'] = df_summary['similarity'] > threshold

  fig = px.bar(
    df_summary,
    y='similarity',
    color='ok',
    color_discrete_map={ True: px.colors.qualitative.Plotly[2], False: px.colors.qualitative.Set2[7] },
    text='similarity',
    text_auto='.3f',
    labels={'tag': 'Category', 'similarity': 'Similarity'},
    title = f"{transcript[:200]}..."
  )
  fig.add_shape( # add a horizontal "target" line
    type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
    x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y"
  )
  fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
  fig.update_yaxes(range=[0, 1])

  # Top three sentence/phrase pairs per tag, most similar first.
  details = (
    df_results
    .drop(columns='line')
    .sort_values(['tag', 'similarity'], ascending=[True, False])
    .groupby('tag')
    .head(3)
    .reset_index(drop=True)
  )

  res = df_summary['similarity'].to_dict()

  return res, fig, details

# Gradio UI

# Page layout. Components are declared in the order they appear on screen:
# title, transcript box, threshold slider, button, label + plot, details grid.
# The CSS sets a white page with ./qc-logo.png as a 75x75px background image
# anchored at the top-left corner.
with gr.Blocks(css=".gradio-container { background-color: white; background-image: url('file=./qc-logo.png'); background-size: 75px 75px; background-repeat: no-repeat; background-position: 0px 0px; }") as demo:
  # Eight non-breaking spaces precede the title (presumably to clear the logo).
  gr.Markdown(f"# {'&nbsp;' * 8}Transcript classifier with GPT-3")
  with gr.Row():
    transcript = gr.Textbox(lines=3, label="Transcript", placeholder="Transcript Here...")
  with gr.Row():
    # Threshold as a percentage from 0 to 100, default 80;
    # quantified_classification divides it by 100 before use.
    threshold = gr.Slider(0, 100, 80)
  btn = gr.Button(value="Analyze!", variant="primary")
  with gr.Row():
    label = gr.Label()
    plot = gr.Plot()
  with gr.Row():
    grid = gr.Dataframe(wrap=True)
  # The three values returned by quantified_classification (res, fig, details)
  # fill the label, the plot and the grid, in that order.
  btn.click(fn=quantified_classification, inputs=[transcript,threshold], outputs=[label,plot,grid])
  # Sample inputs: each entry is [transcript text, threshold].
  gr.Examples(
    [
      [ "Oh, so the quantified platform is one of the most advanced communication intelligence in AI powered coaching systems. And what does that really mean? So, um, communication coaching is something that is typically delivered one on one between a communication coach who has a, uh, a doctorate or a, um, background and experience in teaching people how to be better communicators and how to express themselves effectively. Um, those coaches would work one-on-one with individuals, um, maybe put their information in front of audiences and see how well they respond. And that can be a very costly process as well as a time consuming. And, um, not always backed by the science of what really drives great communication. Thank you very much.", 80 ],
      [ "So you can go from where you are today to being exceptional in the way that you communicate and speak. Who's helped most by quantified. Well, everybody communicates all day as part of their jobs. We actually study that 80% of your time at work is spent communicating. So who's helped most anyone that talks to customers, anyone that talks to other team members, anyone that talks to people for a living is going to be helped the most, really the more that you communicate as a critical component of your job, the more you're gonna be helped. Finally, how can quantified have the greatest impact on my organization?", 80 ],
      [ "It's hard to find time for managers to coach, and it is hard for us to give feedback. It's hard for people to feel empowered, to work on something that is personal and private in a safe space. Uh, how can it have the greatest impact, uh, customer experience communicating with the customer, spending time spending a lot of time, communicating with other members of your team, internal communication and external communication. Uh, we wanna make you remarkably better. We want to make you extraordinary at that behavior.", 80 ],
      [ "So you'll have a good understanding of how you come across a relay back to exactly how you're doing, how you can get better. Um, and coach you using that artificial intelligence. It's able to give you that objective feedback. It's gonna be exactly as if you have the world's best communications coach was sitting, um, there with you in every one of your conversations, telling you how to get better and telling you how to optimize your behavior, who does it help? So literally anyone that has conversations, if you're an entry level person, um, or if you're the most senior executive, you will benefit from our platform, the most powerful group that we can help our customer facing teams, that's sales teams, customer service teams, customer support, and customer success team.", 80 ]
    ],
    [transcript, threshold],
    fn=quantified_classification
  )

# Start the app at import time with debug mode enabled.
demo.launch(debug=True)