Spaces:

sergiomar73
/

nlp-gpt3-zero-shot-classification-app

Sleeping

App Files Files Community

sergiomar73 committed on Sep 30, 2022

Commit

2ca06ea

•

1 Parent(s): 1130b2c

Create app.py

Browse files

Files changed (1) hide show

app.py +153 -0

app.py ADDED Viewed

	@@ -0,0 +1,153 @@

+# https://huggingface.co/tasks/token-classification
+# https://huggingface.co/spacy/en_core_web_sm
+# pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
+import gradio as gr
+import os
+import time
+import openai
+import numpy as np
+import pandas as pd
+import spacy
+import en_core_web_sm
+import plotly.express as px
+openai.organization = os.environ.get('ORGANIZATION')
+openai.api_key = os.environ.get('API_KEY')
+nlp = spacy.load("en_core_web_sm")
+def text_to_sentences(text):
+  doc = nlp(text)
+  sentences = [ sentence.text for sentence in list(doc.sents) ]
+  # print(sentences[:3])
+  return sentences
+def calculate_embeddings_with_gpt3(text, engine="text-similarity-davinci-001", interval = 1.5, verbose=True):
+  if verbose:
+    print(f'Calculating embedding for {text}...')
+  time.sleep(interval)
+  response = openai.Embedding.create(
+    input=text,
+    engine=engine
+  )
+  embedding = response['data'][0]['embedding']
+  return embedding
+def gpt3_zero_shot_classification(text, labels):
+  df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
+  for idx, sentence in enumerate(text_to_sentences(text)):
+    embedding = calculate_embeddings_with_gpt3(sentence)
+    # Create new row
+    new_row = {
+      'line': idx + 1,
+      'sentence': sentence,
+      'embedding': embedding
+    }
+    df_sentences = df_sentences.append(new_row, ignore_index=True)
+  # print(df_sentences.shape)
+  # df_sentences.head()
+  targets = np.array([ np.array(value[0]) for value in df_phrases[["embedding"]].values ])
+  # print(f"targets:{targets.shape}")
+  df_cosines = pd.DataFrame(columns=['line'])
+  for i, row in df_sentences.iterrows():
+    line = f'{row["line"]:03}'
+    # print(f'Calculating cosines for [ {line} ] {row["sentence"][:50]}...')
+    source = np.array(row["embedding"])
+    cosine = np.dot(targets,source)/(np.linalg.norm(targets, axis=1)*np.linalg.norm(source))
+    # Create new row
+    new_row = dict([(f"Cosine{f'{key:02}'}", value) for key, value in enumerate(cosine.flatten(), 1)])
+    new_row["line"] = row["line"]
+    df_cosines = df_cosines.append(new_row, ignore_index=True)
+  df_cosines['line'] = df_cosines['line'].astype('int')
+  # print(df_cosines.shape)
+  # df_cosines.head(3)
+  df_comparison = df_cosines #[(df_cosines.filter(regex='Cosine') > threshold).any(axis=1)]
+  # print(df_comparison.shape)
+  # df_comparison.head(3)
+  threshold = threshold / 100
+  df_results = pd.DataFrame(columns=['line', 'sentence', 'phrase', 'category', 'tag', 'similarity'])
+  for i, row in df_comparison.iterrows():
+    for n in range(1,64+1):
+      col = f"Cosine{f'{n:02}'}"
+      # if row[col] > threshold:
+      phrase = df_phrases.loc[[ n - 1 ]]
+      new_row = {
+        'line': row["line"],
+        'sentence': df_sentences.at[int(row["line"])-1,"sentence"],
+        'phrase': df_phrases.at[n-1,"example"],
+        'category': df_phrases.at[n-1,"category"],
+        'tag': df_phrases.at[n-1,"label"],
+        'similarity': row[col]
+      }
+      df_results = df_results.append(new_row, ignore_index=True)
+  df_results['line'] = df_cosines['line'].astype('int')
+  # print(df_results.shape)
+  # df_results.head(3)
+  df_summary = df_results.groupby(['tag'])['similarity'].agg('max').to_frame()
+  df_summary['ok'] = np.where(df_summary['similarity'] > threshold, True, False)
+  # df_summary
+  fig = px.bar(
+    df_summary,
+    y='similarity',
+    color='ok',
+    color_discrete_map={ True: px.colors.qualitative.Plotly[2], False: px.colors.qualitative.Set2[7] },
+    text='similarity',
+    text_auto='.3f',
+    labels={'tag': 'Category', 'similarity': 'Similarity'},
+    title = f"{text[:200]}..."
+  )
+  fig.add_shape( # add a horizontal "target" line
+    type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
+    x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y"
+  )
+  fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
+  fig.update_yaxes(range=[0, 1])
+  # fig.show()
+  details = df_results.drop(labels='line',axis=1).sort_values(['tag','similarity'],ascending=[True,False]).groupby('tag').head(3).reset_index()    .drop(labels='index',axis=1)
+  res = df_summary['similarity'].to_dict()
+  return res, fig, details
+# Gradio UI
+with gr.Blocks(css=".gradio-container { background-color: white; }") as demo:
+  gr.Markdown(f"# GPT-3 Zero shot classification app")
+  with gr.Row():
+    context = gr.Textbox(lines=3, label="Context", placeholder="Context Here...")
+  with gr.Row():
+    threshold = gr.Slider(0, 100, 80)
+  btn = gr.Button(value="Analyze!", variant="primary")
+  with gr.Row():
+    label = gr.Label()
+    plot = gr.Plot()
+  with gr.Row():
+    grid = gr.Dataframe(wrap=True)
+  btn.click(fn=gpt3_zero_shot_classification, inputs=[context,threshold], outputs=[label,plot,grid])
+  gr.Examples(
+    [
+      [ "", "Entertainment, Business, Politics" ],
+      [ "", "Entertainment, Business, Politics" ],
+      [ "", "Entertainment, Business, Politics" ],
+      [ "", "Entertainment, Business, Politics" ]
+    ],
+    [context, threshold],
+    fn=gpt3_zero_shot_classification
+  )
+demo.launch(debug=True)