File size: 8,037 Bytes
bdb2fb9
 
47434c8
bdb2fb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47434c8
bdb2fb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c09011
 
bdb2fb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c09011
 
 
 
 
 
 
 
bdb2fb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c09011
 
bdb2fb9
 
0c09011
bdb2fb9
 
623d82f
7101094
bdb2fb9
 
 
 
0c09011
 
3ea6377
e17ad0e
bdb2fb9
 
 
 
 
 
 
 
 
3ea6377
bdb2fb9
 
 
c655bb6
 
 
811fd5e
bdb2fb9
 
 
 
 
606d176
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# https://huggingface.co/tasks/token-classification
# https://huggingface.co/spacy/en_core_web_sm
# pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl

import gradio as gr
import os
import time
import openai
import numpy as np
import pandas as pd
import pickle5 as pickle
import spacy
import en_core_web_sm
import plotly.express as px

# OpenAI credentials are read from the environment; os.environ.get returns
# None when a variable is not set.
openai.organization = os.environ.get('ORGANIZATION')
openai.api_key = os.environ.get('API_KEY')

# Reference phrases, loaded once at import time. quantified_classification
# reads the 'embedding', 'example', 'category' and 'label' columns of this frame.
# NOTE(review): read_pickle unpickles the file, so it must come from a trusted source.
df_phrases_path = './df_phrases.pkl'
df_phrases = pd.read_pickle(df_phrases_path)
# print(df_phrases.shape)
# df_phrases.head(3)

# spaCy pipeline used by transcript_to_sentences to split text into sentences.
nlp = spacy.load("en_core_web_sm")

def transcript_to_sentences(transcript):
  """Split `transcript` into sentences with the module-level spaCy pipeline.

  Returns a list with the text of each sentence span, in document order.
  """
  parsed = nlp(transcript)
  return [span.text for span in parsed.sents]
 
def calculate_embeddings_with_gpt3(text, engine="text-similarity-davinci-001", interval=1.5, verbose=True):
  """Fetch the embedding of `text` from the OpenAI Embedding endpoint.

  Args:
    text: The text to embed.
    engine: Name of the OpenAI engine passed to the request.
    interval: Seconds to sleep before issuing the request.
    verbose: When true, print a progress line naming the text being embedded.

  Returns:
    The 'embedding' entry of the first item in the response's 'data' list.
  """
  if verbose:
    print(f'Calculating embedding for {text}...')

  # Pause before every request (presumably to stay under API rate limits).
  time.sleep(interval)

  api_result = openai.Embedding.create(input=text, engine=engine)
  first_item = api_result['data'][0]
  return first_item['embedding']
 
def quantified_classification(transcript, threshold):
  """Score a transcript against the reference phrases and summarise per tag.

  The transcript is split into sentences, each sentence is embedded with
  calculate_embeddings_with_gpt3 (one request per sentence), and the cosine
  similarity between every sentence and every row of `df_phrases` is computed.

  Args:
    transcript: The text to analyse.
    threshold: Similarity cut-off expressed as a percentage (it is divided
      by 100 before being compared with the cosine similarities).

  Returns:
    A tuple `(res, fig, details)`:
      res: dict mapping each tag to the highest similarity found for it.
      fig: Plotly bar chart of those maxima, coloured by whether they exceed
        the cut-off, with a dotted horizontal line at the cut-off.
      details: DataFrame holding, for each tag, the three sentence/phrase
        pairs with the highest similarity (columns: sentence, phrase,
        category, tag, similarity).
  """
  cutoff = threshold / 100

  # Reference embeddings stacked into a (phrases x dimensions) matrix. Their
  # norms do not depend on the sentence, so compute them once.
  targets = np.array(df_phrases["embedding"].tolist())
  target_norms = np.linalg.norm(targets, axis=1)

  # Phrase metadata in the same positional order as the rows of `targets`,
  # so the k-th cosine always pairs with the k-th phrase regardless of the
  # frame's index labels or how many phrases it holds.
  examples = df_phrases["example"].tolist()
  categories = df_phrases["category"].tolist()
  tags = df_phrases["label"].tolist()

  # Collect plain dicts and build the frame once: DataFrame.append was removed
  # in pandas 2.0 and copied the whole frame on every call.
  records = []
  for line, sentence in enumerate(transcript_to_sentences(transcript), 1):
    source = np.array(calculate_embeddings_with_gpt3(sentence))
    cosines = np.dot(targets, source) / (target_norms * np.linalg.norm(source))
    for example, category, tag, similarity in zip(examples, categories, tags, cosines):
      records.append({
        'line': line,
        'sentence': sentence,
        'phrase': example,
        'category': category,
        'tag': tag,
        'similarity': float(similarity),
      })

  df_results = pd.DataFrame(
    records,
    columns=['line', 'sentence', 'phrase', 'category', 'tag', 'similarity'],
  )

  # Best similarity per tag, flagged by whether it clears the cut-off.
  df_summary = df_results.groupby(['tag'])['similarity'].agg('max').to_frame()
  df_summary['ok'] = df_summary['similarity'] > cutoff

  fig = px.bar(
    df_summary,
    y='similarity',
    color='ok',
    color_discrete_map={ True: px.colors.qualitative.Plotly[2], False: px.colors.qualitative.Set2[7] },
    text='similarity',
    text_auto='.3f',
    labels={'tag': 'Category', 'similarity': 'Similarity'},
    title=f"{transcript[:200]}..."
  )
  fig.add_shape( # add a horizontal "target" line
    type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
    x0=0, x1=1, xref="paper", y0=cutoff, y1=cutoff, yref="y"
  )
  fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
  fig.update_yaxes(range=[0, 1])

  # Top three matches per tag, highest similarity first.
  details = (
    df_results
    .drop(columns='line')
    .sort_values(['tag', 'similarity'], ascending=[True, False])
    .groupby('tag')
    .head(3)
    .reset_index(drop=True)
  )

  res = df_summary['similarity'].to_dict()

  return res, fig, details

# Gradio UI

# Page layout: a transcript box, a threshold slider, an "Analyze!" button, and
# three outputs (label, plot, table). The CSS draws ./qc-logo.png as a 75x75
# background image in the top-left corner of the container.
with gr.Blocks(css=".gradio-container { background-color: white; background-image: url('file=./qc-logo.png'); background-size: 75px 75px; background-repeat: no-repeat; background-position: 0px 0px; }") as demo:
  # The heading is prefixed with eight spaces (presumably to keep clear of the logo).
  gr.Markdown(f"# {' ' * 8}Transcript classifier with GPT-3")
  with gr.Row():
    transcript = gr.Textbox(lines=3, label="Transcript", placeholder="Transcript Here...")
  with gr.Row():
    # Positional arguments 0, 100, 80 — presumably minimum, maximum and initial
    # value; quantified_classification divides the slider value by 100.
    threshold = gr.Slider(0, 100, 80)
  btn = gr.Button(value="Analyze!", variant="primary")
  with gr.Row():
    label = gr.Label()
    plot = gr.Plot()
  with gr.Row():
    grid = gr.Dataframe(wrap=True)
  # The three outputs receive, in order, the (res, fig, details) tuple returned
  # by quantified_classification.
  btn.click(fn=quantified_classification, inputs=[transcript,threshold], outputs=[label,plot,grid])
  # Sample inputs: each entry is [transcript text, threshold] and is bound to
  # the two input components listed after the examples.
  gr.Examples(
    [
      [ "Oh, so the quantified platform is one of the most advanced communication intelligence in AI powered coaching systems. And what does that really mean? So, um, communication coaching is something that is typically delivered one on one between a communication coach who has a, uh, a doctorate or a, um, background and experience in teaching people how to be better communicators and how to express themselves effectively. Um, those coaches would work one-on-one with individuals, um, maybe put their information in front of audiences and see how well they respond. And that can be a very costly process as well as a time consuming. And, um, not always backed by the science of what really drives great communication. Thank you very much.", 80 ],
      [ "So you can go from where you are today to being exceptional in the way that you communicate and speak. Who's helped most by quantified. Well, everybody communicates all day as part of their jobs. We actually study that 80% of your time at work is spent communicating. So who's helped most anyone that talks to customers, anyone that talks to other team members, anyone that talks to people for a living is going to be helped the most, really the more that you communicate as a critical component of your job, the more you're gonna be helped. Finally, how can quantified have the greatest impact on my organization?", 80 ],
      [ "It's hard to find time for managers to coach, and it is hard for us to give feedback. It's hard for people to feel empowered, to work on something that is personal and private in a safe space. Uh, how can it have the greatest impact, uh, customer experience communicating with the customer, spending time spending a lot of time, communicating with other members of your team, internal communication and external communication. Uh, we wanna make you remarkably better. We want to make you extraordinary at that behavior.", 80 ],
      [ "So you'll have a good understanding of how you come across a relay back to exactly how you're doing, how you can get better. Um, and coach you using that artificial intelligence. It's able to give you that objective feedback. It's gonna be exactly as if you have the world's best communications coach was sitting, um, there with you in every one of your conversations, telling you how to get better and telling you how to optimize your behavior, who does it help? So literally anyone that has conversations, if you're an entry level person, um, or if you're the most senior executive, you will benefit from our platform, the most powerful group that we can help our customer facing teams, that's sales teams, customer service teams, customer support, and customer success team.", 80 ]
    ],
    [transcript, threshold],
    fn=quantified_classification
  )

# Start the app when the module is executed; debug=True is passed to launch().
demo.launch(debug=True)