Spaces:

Cloudy1225
/

stackoverflow-sentiment-analysis

Sleeping

App Files Files Community

LiuYunhui committed on Jun 3, 2023

Commit

08db26d

1 Parent(s): 5f1632a

Add application file

Browse files

Files changed (6) hide show

README.md +3 -1
SOF4423.csv +0 -0
__pycache__/sentiment_analyser.cpython-310.pyc +0 -0
app.py +115 -0
requirements.txt +6 -0
sentiment_analyser.py +84 -0

README.md CHANGED Viewed

@@ -10,4 +10,6 @@ pinned: false
 license: openrail
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 license: openrail
 ---
+# Sentiment Analysis on Software Engineer Texts
+This is a demo for our fine-tuned model [stackoverflow-roberta-base-sentiment](https://huggingface.co/Cloudy1225/stackoverflow-roberta-base-sentiment).

SOF4423.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

__pycache__/sentiment_analyser.cpython-310.pyc ADDED Viewed

Binary file (3.52 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import gradio as gr
+import pandas as pd
+from sentiment_analyser import RandomAnalyser, RoBERTaAnalyser, ChatGPTAnalyser
+import matplotlib.pyplot as plt
+from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
+def plot_bar(value_counts):
+    """Draw a horizontal bar chart of prediction frequencies.
+
+    Args:
+        value_counts: Counts indexed by label; ``classify`` passes the result
+            of ``Series.value_counts()`` on the predicted labels.
+
+    Returns:
+        The matplotlib figure that holds the chart.
+    """
+    fig, ax = plt.subplots(figsize=(6, 6))
+    value_counts.plot.barh(ax=ax)
+    # Write each bar's count next to the bar.
+    ax.bar_label(ax.containers[0])
+    # Title the axes created above rather than pyplot's global "current" axes,
+    # which may belong to a different figure when requests run concurrently.
+    ax.set_title('Frequency of Predictions')
+    return fig
+def plot_confusion_matrix(y_pred, y_true):
+    """Plot a confusion matrix normalised over the true labels.
+
+    Args:
+        y_pred: Predicted sentiment labels.
+        y_true: Ground-truth sentiment labels, aligned with ``y_pred``.
+            NOTE(review): assumes both use the strings 'negative', 'neutral'
+            and 'positive' -- confirm against the dataset's Label column.
+
+    Returns:
+        The matplotlib figure that holds the plotted matrix.
+    """
+    labels = ['negative', 'neutral', 'positive']
+    # Pin the class set and order explicitly. A small sample can easily miss
+    # one class; without ``labels`` the matrix would then be smaller than 3x3
+    # and would no longer line up with the three display labels below.
+    cm = confusion_matrix(y_true, y_pred, labels=labels, normalize='true')
+    fig, ax = plt.subplots(figsize=(6, 6))
+    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
+    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
+    # Title the axes created above rather than pyplot's global "current" axes.
+    ax.set_title("Normalized Confusion Matrix")
+    return fig
+def classify(num: int):
+    """Evaluate the model named by ``OUR_MODEL`` on a random dataset sample.
+
+    Args:
+        num: Number of rows to draw from the module-level ``df``.
+
+    Returns:
+        A tuple ``(sampled rows with an added 'Predict' column,
+        prediction-frequency bar chart, confusion-matrix figure)``.
+    """
+    sampled = df.sample(num)
+    texts = sampled['Text'].tolist()
+    truth = sampled['Label']
+    # Re-attach the sample's index so predictions align row-by-row with it.
+    predictions = pd.Series(MODEL_MAPPING[OUR_MODEL].predict(texts), index=sampled.index)
+    sampled['Predict'] = predictions
+    frequency_fig = plot_bar(predictions.value_counts())
+    matrix_fig = plot_confusion_matrix(predictions, truth)
+    return sampled, frequency_fig, matrix_fig
+def analysis(Text):
+    """Classify one sentence with every model in ``MODEL_MAPPING``.
+
+    Args:
+        Text: The sentence to analyse.
+
+    Returns:
+        A single-row DataFrame with one column per model name. Each cell is
+        the emoji mapped to the predicted label, or the raw label itself when
+        the model returned something outside ``SENTI_MAPPING``.
+    """
+    row = {}
+    for name, model in MODEL_MAPPING.items():
+        label = model.predict([Text])[0]
+        # A model may answer with a label that has no emoji (e.g. free-text
+        # output); show the raw label instead of failing with a KeyError.
+        row[name] = SENTI_MAPPING.get(label, label)
+    return pd.DataFrame([list(row.values())], columns=list(row))
+# Display name -> analyser instance. All three analysers are constructed here,
+# at import time; the key order is also the column order of the results table.
+MODEL_MAPPING = {
+    'Random': RandomAnalyser(),
+    'RoBERTa': RoBERTaAnalyser(),
+    'ChatGPT': ChatGPTAnalyser(),
+}
+# Key into MODEL_MAPPING of the model that classify() evaluates.
+OUR_MODEL = 'RoBERTa'
+# Sentiment label -> emoji shown in the inference results table.
+SENTI_MAPPING = {
+    'negative': '😭',
+    'neutral': '😶',
+    'positive': '🥰'
+}
+# Page title, used for the browser tab and the top-level heading.
+TITLE = "Sentiment Analysis on Software Engineer Texts"
+# Markdown shown under the title: a short introduction plus a link to the
+# model page on the Hugging Face Hub.
+DESCRIPTION = (
+    "这里是第16组“睿王和他的五个小跟班”软工三迭代三模型演示页面。"
+    "模型链接：[Cloudy1225/stackoverflow-roberta-base-sentiment]"
+    "(https://huggingface.co/Cloudy1225/stackoverflow-roberta-base-sentiment) "
+)
+# Upper bound of the "Number of samples" slider.
+MAX_SAMPLES = 64
+# Evaluation dataset, loaded once at import; classify() reads its 'Text' and
+# 'Label' columns.
+df = pd.read_csv('./SOF4423.csv')
+# Build the Gradio page: a single-sentence inference section followed by a
+# sampled evaluation section.
+with gr.Blocks(title=TITLE) as demo:
+    gr.HTML(f"<H1>{TITLE}</H1>")
+    gr.Markdown(DESCRIPTION)
+    # --- Section 1: classify one user-supplied sentence with every model ---
+    gr.HTML("<H2>Model Inference</H2>")
+    # Usage hint (in Chinese): type text on the left and press Enter; the
+    # random, model and ChatGPT results appear on the right.
+    gr.Markdown((
+        "在左侧文本框中输入文本并按回车键，右侧将输出情感分析结果。"
+        "这里我们展示了三种结果，分别是随机结果、模型结果和 ChatGPT 结果。"
+    ))
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(label='Input',
+                                    placeholder="Enter a positive or negative sentence here...")
+        with gr.Column():
+            # One column per model; the initial row is a placeholder emoji
+            # shown until the first submission.
+            senti_output = gr.Dataframe(type="pandas", value=[['😋', '😋', '😋']],
+                                        headers=list(MODEL_MAPPING.keys()), interactive=False)
+    # Submitting the textbox runs analysis() and fills the results table.
+    text_input.submit(analysis, inputs=text_input, outputs=senti_output, show_progress=True)
+    # --- Section 2: evaluate the model on a random sample of the dataset ---
+    gr.HTML("<H2>Model Evaluation</H2>")
+    # Usage hint (in Chinese): moving the slider samples that many rows from
+    # the StackOverflow4423 dataset, predicts their labels, and plots the
+    # label distribution and a confusion matrix.
+    gr.Markdown((
+        "这里是在 StackOverflow4423 数据集上评估我们的模型。"
+        "滑动 Slider，将会从 StackOverflow4423 数据集中抽样出指定数量的样本，预测其情感标签。"
+        "并根据预测结果绘制标签分布图和混淆矩阵。"
+    ))
+    # NOTE(review): input_models is not referenced again in the visible code.
+    input_models = list(MODEL_MAPPING)
+    input_n_samples = gr.Slider(
+        minimum=4,
+        maximum=MAX_SAMPLES,
+        value=8,
+        step=4,
+        label='Number of samples'
+    )
+    with gr.Row():
+        with gr.Column():
+            bar_plot = gr.Plot(label='Predictions Frequency')
+        with gr.Column():
+            cm_plot = gr.Plot(label='Confusion Matrix')
+    with gr.Row():
+        dataframe = gr.Dataframe(type="pandas", wrap=True)
+    # Every slider change re-runs classify() and refreshes the table and both plots.
+    input_n_samples.change(fn=classify, inputs=input_n_samples, outputs=[dataframe, bar_plot, cm_plot])
+# Start the app; this runs whenever the module is executed or imported.
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+pandas
+gradio
+# sentiment_analyser.py uses the pre-1.0 client interface
+# (openai.ChatCompletion, openai.error), which openai 1.0 removed.
+openai<1.0
+matplotlib
+transformers
+scikit-learn

sentiment_analyser.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import time
+import openai
+import random
+from transformers import pipeline
+class RandomAnalyser:
+    """Baseline analyser that answers with a randomly chosen sentiment label."""
+    def __init__(self):
+        # Candidate labels that predict() draws from.
+        self.LABELS = ['negative', 'neutral', 'positive']
+    def predict(self, X: list):
+        """Return one randomly drawn label for each element of ``X``.
+
+        The inputs themselves are ignored; only their count matters.
+        """
+        candidates = self.LABELS
+        return [random.choice(candidates) for _ in X]
+class RoBERTaAnalyser:
+    """Sentiment analyser backed by a Hugging Face ``sentiment-analysis`` pipeline."""
+    def __init__(self):
+        # Build the pipeline once; every predict() call reuses it.
+        self.analyser = pipeline(
+            task="sentiment-analysis",
+            model="Cloudy1225/stackoverflow-roberta-base-sentiment",
+        )
+    def predict(self, X: list):
+        """Return the top predicted label for each text in ``X``.
+
+        Each text is preprocessed and passed to the pipeline on its own.
+        """
+        return [
+            self.analyser(RoBERTaAnalyser.preprocess(text))[0]['label']
+            for text in X
+        ]
+    @staticmethod
+    def preprocess(text):
+        """Replace user mentions and links with fixed placeholders.
+
+        The text is split on single spaces. A token that starts with '@' and
+        is longer than one character becomes '@user'; a token that starts
+        with 'http' becomes 'http'. The tokens are re-joined with single
+        spaces and surrounding whitespace is stripped.
+        """
+        def mask(token):
+            if token.startswith('@') and len(token) > 1:
+                return '@user'
+            if token.startswith('http'):
+                return 'http'
+            return token
+        return ' '.join(mask(token) for token in text.split(' ')).strip()
+class ChatGPTAnalyser:
+    """Sentiment analyser that asks an OpenAI chat model for a label.
+
+    API keys are read from the environment rather than stored in source: set
+    ``OPENAI_API_KEYS`` to a comma-separated list of keys, or
+    ``OPENAI_API_KEY`` to a single key, before calling ``predict``.
+    """
+    # Labels the prompt asks the model to choose from.
+    LABELS = ('negative', 'neutral', 'positive')
+    # Seconds to wait before retrying after a rate-limit or API error.
+    RETRY_DELAY_SECONDS = 60
+    def __init__(self):
+        import os  # local import: only needed to read the key configuration
+        self.MODEL = "gpt-3.5-turbo"
+        # SECURITY: secrets must never be committed to the repository. Any key
+        # that was ever hard-coded here is leaked and has to be revoked.
+        raw_keys = os.environ.get("OPENAI_API_KEYS") or os.environ.get("OPENAI_API_KEY", "")
+        self.KEYs = [key.strip() for key in raw_keys.split(",") if key.strip()]
+        self.TASK_NAME = 'Sentiment Classification'
+        self.TASK_DEFINITION = 'Given the sentence, assign a sentiment label from [negative, neutral, positive].'
+        self.OUT_FORMAT = 'Return label only without any other text.'
+        # ``{{}}`` survives the f-string as ``{}`` and is filled in by predict().
+        self.PROMPT_PREFIX = f"Please perform {self.TASK_NAME} task.{self.TASK_DEFINITION}{self.OUT_FORMAT}\nSentence:\n{{}}\nLabel:"
+    @classmethod
+    def _normalise_label(cls, raw):
+        """Map a raw model reply onto one of ``LABELS`` when that is unambiguous.
+
+        The reply is stripped and lower-cased. If it contains exactly one of
+        the known labels (e.g. "Positive." or "Label: negative"), that label
+        is returned; otherwise the cleaned reply is returned unchanged.
+        """
+        cleaned = raw.strip().lower()
+        if cleaned in cls.LABELS:
+            return cleaned
+        found = [label for label in cls.LABELS if label in cleaned]
+        return found[0] if len(found) == 1 else cleaned
+    def predict(self, X: list):
+        """Return one sentiment label per text in ``X``.
+
+        Each text is sent in its own chat request. Rate-limit and API errors
+        are retried indefinitely after ``RETRY_DELAY_SECONDS``; any other
+        exception propagates to the caller.
+
+        Raises:
+            RuntimeError: if ``X`` is non-empty and no API key is configured.
+        """
+        if X and not self.KEYs:
+            raise RuntimeError(
+                "No OpenAI API key configured; set OPENAI_API_KEYS or OPENAI_API_KEY."
+            )
+        sentiments = []
+        for text in X:
+            messages = [{"role": "user", "content": self.PROMPT_PREFIX.format(text)}]
+            while True:
+                # Choose a key per attempt so a retry may use a different key
+                # than the one that just hit its rate limit.
+                openai.api_key = random.choice(self.KEYs)
+                try:
+                    response = openai.ChatCompletion.create(
+                        model=self.MODEL,
+                        messages=messages,
+                        temperature=0,
+                        n=1,
+                        stop=None
+                    )
+                except (openai.error.RateLimitError, openai.error.APIError):
+                    time.sleep(self.RETRY_DELAY_SECONDS)
+                    continue
+                sentiments.append(self._normalise_label(response.choices[0].message.content))
+                break
+        return sentiments