sergiomar73 committed on
Commit
2ca06ea
1 Parent(s): 1130b2c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -0
app.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://huggingface.co/tasks/token-classification
2
+ # https://huggingface.co/spacy/en_core_web_sm
3
+ # pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
4
+
5
+ import gradio as gr
6
+ import os
7
+ import time
8
+ import openai
9
+ import numpy as np
10
+ import pandas as pd
11
+ import spacy
12
+ import en_core_web_sm
13
+ import plotly.express as px
14
+
15
# Credentials are read from the environment so they never live in the
# repository; os.getenv yields None when a variable is unset.
openai.organization = os.getenv('ORGANIZATION')
openai.api_key = os.getenv('API_KEY')

# Load the small English spaCy pipeline once, at import time.
nlp = spacy.load("en_core_web_sm")
19
+
20
def text_to_sentences(text):
    """Split *text* into sentences using the module-level spaCy pipeline.

    Returns a list holding the raw text of every sentence span, in the
    order the pipeline yields them.
    """
    parsed = nlp(text)
    return [span.text for span in parsed.sents]
25
+
26
def calculate_embeddings_with_gpt3(text, engine="text-similarity-davinci-001", interval = 1.5, verbose=True):
    """Request an embedding for *text* from the OpenAI Embedding endpoint.

    Args:
        text: Value passed as ``input`` to ``openai.Embedding.create``.
        engine: Engine name forwarded to the API.
        interval: Seconds to sleep before the request is sent.
        verbose: When true, print a progress line before the request.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.
    """
    if verbose:
        print(f'Calculating embedding for {text}...')
    # Pause before every request (presumably to stay under the API rate limit).
    time.sleep(interval)
    api_reply = openai.Embedding.create(input=text, engine=engine)
    first_item = api_reply['data'][0]
    return first_item['embedding']
36
+
37
def gpt3_zero_shot_classification(text, labels):
    """Score every sentence of *text* against the reference phrases, per tag.

    Each sentence is embedded, compared by cosine similarity with every
    embedding in the module-level ``df_phrases`` frame, and the best
    similarity per tag is reported together with a bar chart and a table of
    the closest phrase matches.

    Args:
        text: Free text to analyse; it is split into sentences first.
        labels: Cut-off as a percentage (0-100). Despite its name, this is the
            slot the UI in this file feeds the threshold slider into.

    Returns:
        A tuple ``(res, fig, details)``:
        ``res`` maps each tag to its maximum similarity,
        ``fig`` is a Plotly bar chart of those maxima with the threshold line,
        ``details`` is a DataFrame with the three best rows per tag.

    Raises:
        ValueError / TypeError: if ``labels`` cannot be converted to a float.
    """
    # The previous body read ``threshold`` before assigning it, which raised
    # UnboundLocalError on every call. The only caller visible in this file
    # passes the slider value as the second positional argument, so the
    # cut-off is taken from there.
    # NOTE(review): the parameter is still called ``labels`` -- confirm intent.
    threshold = float(labels) / 100

    sentences = text_to_sentences(text)

    # NOTE(review): ``df_phrases`` is not defined in the visible file. It is
    # expected to provide 'example', 'category', 'label' and 'embedding'
    # columns -- verify where it is created.
    phrase_records = df_phrases[["example", "category", "label"]].to_dict("records")
    targets = np.array([np.array(vector) for vector in df_phrases["embedding"]])
    # Loop-invariant: the phrase norms do not depend on the sentence.
    target_norms = np.linalg.norm(targets, axis=1)

    # Rows are collected in a plain list and turned into a frame once;
    # DataFrame.append was quadratic and no longer exists in pandas >= 2.0.
    result_rows = []
    for line, sentence in enumerate(sentences, 1):
        source = np.array(calculate_embeddings_with_gpt3(sentence))
        cosines = np.dot(targets, source) / (target_norms * np.linalg.norm(source))
        # Pair every phrase with its similarity instead of assuming exactly
        # 64 reference phrases.
        for phrase, similarity in zip(phrase_records, cosines.flatten()):
            result_rows.append({
                'line': line,
                'sentence': sentence,
                'phrase': phrase["example"],
                'category': phrase["category"],
                'tag': phrase["label"],
                'similarity': float(similarity),
            })

    df_results = pd.DataFrame(
        result_rows,
        columns=['line', 'sentence', 'phrase', 'category', 'tag', 'similarity'],
    )

    # Best similarity reached by any sentence for each tag.
    df_summary = df_results.groupby(['tag'])['similarity'].agg('max').to_frame()
    df_summary['ok'] = df_summary['similarity'] > threshold

    fig = px.bar(
        df_summary,
        y='similarity',
        color='ok',
        color_discrete_map={ True: px.colors.qualitative.Plotly[2], False: px.colors.qualitative.Set2[7] },
        text='similarity',
        text_auto='.3f',
        labels={'tag': 'Category', 'similarity': 'Similarity'},
        title = f"{text[:200]}..."
    )
    fig.add_shape( # add a horizontal "target" line
        type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
        x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y"
    )
    fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
    fig.update_yaxes(range=[0, 1])

    # Three most similar (sentence, phrase) pairs per tag, best first.
    details = (
        df_results
        .drop(labels='line', axis=1)
        .sort_values(['tag', 'similarity'], ascending=[True, False])
        .groupby('tag')
        .head(3)
        .reset_index(drop=True)
    )

    res = df_summary['similarity'].to_dict()

    return res, fig, details
126
+
127
+ # Gradio UI
128
+
129
# Single-page UI: a context box, a 0-100 threshold slider and an "Analyze!"
# button feeding gpt3_zero_shot_classification; its three return values are
# shown as a label, a plot and a data grid.
# NOTE(review): the nesting below is reconstructed; confirm against the
# original layout.
with gr.Blocks(css=".gradio-container { background-color: white; }") as demo:
    gr.Markdown("# GPT-3 Zero shot classification app")
    with gr.Row():
        context = gr.Textbox(lines=3, label="Context", placeholder="Context Here...")
    with gr.Row():
        threshold = gr.Slider(0, 100, 80)
        btn = gr.Button(value="Analyze!", variant="primary")
    with gr.Row():
        label = gr.Label()
        plot = gr.Plot()
    with gr.Row():
        grid = gr.Dataframe(wrap=True)
    btn.click(fn=gpt3_zero_shot_classification, inputs=[context,threshold], outputs=[label,plot,grid])
    # Each example row must line up with [context, threshold]. The rows used to
    # carry the string "Entertainment, Business, Politics" in the slider
    # column; they now carry the slider's default value.
    # NOTE(review): the context texts are still empty placeholders.
    gr.Examples(
        [
            [ "", 80 ],
            [ "", 80 ],
            [ "", 80 ],
            [ "", 80 ]
        ],
        [context, threshold],
        fn=gpt3_zero_shot_classification
    )

demo.launch(debug=True)