# Hugging Face Spaces app. (Removed "Spaces: Sleeping" status-banner text that
# was page-scrape residue, not code.)
# Standard library
import ast
import os
from collections import Counter, defaultdict
from itertools import chain, islice

# Third-party
import gradio as gra
import gradio as gr
import pandas as pd
import spacy
import torch
from datasets import load_dataset
from torch.nn.functional import softmax
from transformers import BertTokenizerFast, BertForSequenceClassification
# Alternative loader kept for reference:
# df = load_dataset('csv', data_files=['pp_selected_reviews_JJ_NN.csv'])
df = pd.read_csv('/home/user/app/pp_selected_reviews_JJ_NN.csv')

# Load the spaCy model, downloading it on first run. The original caught a
# bare `except:` and never loaded the model after downloading it, leaving
# `nlp` undefined on a fresh container.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:  # spaCy raises OSError when the model package is missing
    os.system("python -m spacy download en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")

# Load the fine-tuned sentiment model from the Hugging Face Hub.
model = BertForSequenceClassification.from_pretrained('GiRak/beer-sentiment-bert')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.eval()  # inference only: disable dropout for deterministic predictions

# Tokenizer paired with the fine-tuned model.
tokenizer = BertTokenizerFast.from_pretrained('GiRak/beer-sentiment-bert')
def analyze_sentiment(sentence):
    """Classify one review sentence with the fine-tuned BERT model.

    Returns a dict mapping 'Negative'/'Positive' to softmax probabilities.
    NOTE(review): assumes model class index 0 = Negative, 1 = Positive —
    same ordering the original code used; confirm against the model card.
    """
    # Tokenize into model inputs; truncate so reviews longer than BERT's
    # maximum sequence length do not crash the forward pass.
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True)
    inputs = inputs.to(device)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = softmax(outputs.logits, dim=1)
    # Map each class probability to its label.
    sentiment_labels = ['Negative', 'Positive']
    return {label: prob.item() for label, prob in zip(sentiment_labels, probabilities[0])}
def sentiment_analysis(text):
    """Gradio callback: delegate to analyze_sentiment and return its
    label -> probability dict unchanged."""
    return analyze_sentiment(text)
# NOTE(review): duplicate — `nlp` is already loaded in the try/except near the
# top of the file; verify and remove one of the two loads.
nlp = spacy.load('en_core_web_lg')
# Compute the initial frequency of each keyword.
def get_initial_counts(keywords):
    """Return a mapping of each keyword in *keywords* to its occurrence count.

    Counter replaces the original hand-rolled defaultdict loop; it is a dict
    subclass, so callers that index by key or iterate .items() see identical
    behavior.
    """
    return Counter(keywords)
# Compare only noun tokens: adjective choice varies by reviewer, while nouns
# name the feature itself, so nouns are the better grouping signal.
def custom_similarity(doc1, doc2, noun_similarity_threshold):
    """Return True when any noun in doc1 is more similar than the threshold
    to any noun in doc2; otherwise False."""
    left_nouns = [tok for tok in doc1 if tok.pos_ == 'NOUN']
    right_nouns = [tok for tok in doc2 if tok.pos_ == 'NOUN']
    for a in left_nouns:
        for b in right_nouns:
            if a.similarity(b) > noun_similarity_threshold:
                return True
    return False
# Merge similar keywords and record which raw keywords map to each group.
def merge_keywords(keyword_counts, noun_similarity_threshold):
    """Group similar keywords and accumulate their frequencies.

    keyword_counts: mapping of keyword -> frequency.
    noun_similarity_threshold: threshold forwarded to custom_similarity.

    Returns (merged_keywords, keyword_mappings):
      merged_keywords  — counts keyed by the first-seen representative keyword;
      keyword_mappings — representative -> list of raw keywords folded into it.

    Note: results are order-dependent — the first keyword of a similar pair
    becomes the group representative.
    """
    merged_keywords = defaultdict(int)
    keyword_mappings = defaultdict(list)
    keyword_docs = {}
    # Memoization: keywords already embedded are returned from the cache
    # instead of being re-run through nlp().
    def get_keyword_doc(keyword):
        if keyword not in keyword_docs:
            keyword_docs[keyword] = nlp(keyword)
        return keyword_docs[keyword]
    # Walk the initial frequency table and embed each keyword.
    for keyword, count in keyword_counts.items():
        doc1 = get_keyword_doc(keyword)
        merged_or_found_similar = False
        # Compare the current keyword against every group representative so far.
        for merged_keyword in list(merged_keywords):
            # Exact match with an existing representative: just add the count.
            if keyword == merged_keyword:
                merged_keywords[merged_keyword] += count
                merged_or_found_similar = True
                keyword_mappings[merged_keyword].append(keyword)
                break
            doc2 = get_keyword_doc(merged_keyword)
            # Similarity above the threshold: fold into the same group.
            if custom_similarity(doc1, doc2, noun_similarity_threshold):
                merged_keywords[merged_keyword] += count
                merged_or_found_similar = True
                keyword_mappings[merged_keyword].append(keyword)
                break
        # No existing group matched: the keyword starts a new group.
        if not merged_or_found_similar:
            merged_keywords[keyword] = count
            keyword_mappings[keyword] = [keyword]
    return merged_keywords, keyword_mappings
def merge_similar_keywords_noun_weight(dataframe, noun_similarity_threshold=0.7):
    """Parse the 'Keywords' column, merge similar keywords, and return
    (merged counts sorted descending, representative -> raw-keyword lists),
    both as plain dicts.

    The original version added a temporary column to *dataframe* and dropped
    it again — mutating the caller's frame and risking SettingWithCopyWarning
    on sliced frames. This version computes everything locally.
    """
    # Each cell holds a stringified list; parse it to a real list.
    keyword_lists = dataframe['Keywords'].apply(ast.literal_eval)
    # chain.from_iterable flattens in O(n); Series.sum() on lists concatenates
    # quadratically.
    all_keywords = list(chain.from_iterable(keyword_lists))
    # Count and merge.
    initial_counts = get_initial_counts(all_keywords)
    merged_counts, merged_mappings = merge_keywords(initial_counts, noun_similarity_threshold)
    ordered = sorted(merged_counts.items(), key=lambda kv: kv[1], reverse=True)
    return dict(ordered), dict(merged_mappings)
#########################################################################################################################
# NOTE(review): islice is already imported at the top of the file — this
# re-import is redundant; verify and remove.
from itertools import islice
# Alias the loaded review dataframe for the keyword-extraction tab.
beer_df = df
def show_keywords(beer_name, sentiment, flag=0):
    """Top-10 merged keywords for one beer/sentiment slice of beer_df.

    flag=1 -> return only the formatted top-10 string (Gradio callback form);
    flag=0 -> return (top-10 string, representative -> raw-keyword mapping).
    """
    # Filter in one step; .copy() so downstream operations work on an
    # independent frame instead of a view (avoids SettingWithCopyWarning).
    # Also avoid reusing the module-level name `df` for a local, which the
    # original did.
    subset = beer_df[(beer_df['Beer_name'] == beer_name)
                     & (beer_df['MultinomialNB_label'] == sentiment)].copy()
    subset.reset_index(drop=True, inplace=True)
    keywords, mappings = merge_similar_keywords_noun_weight(subset)
    top10 = "\n".join(f"{k}: {v}" for k, v in islice(keywords.items(), 10))
    # Both branches of the original built the same string; compute it once.
    if flag == 1:
        return top10
    return top10, mappings
def keyword_mappings(beer_name, sentiment, keyword):
    """Return the list of raw keywords grouped under *keyword* for the given
    beer and sentiment, or a fallback message when no mapping exists."""
    # Reuse show_keywords (flag=0 path) so the mapping reflects the same
    # merge run the summary view shows.
    top_keywords, mappings = show_keywords(beer_name, sentiment)
    fallback_message = "ν΄λΉ ν€μλμ 맀νλ λ¬Έμμ΄μ΄ μμ΅λλ€."
    return mappings.get(keyword, fallback_message)
beer_names = list(beer_df['Beer_name'].unique())
sentiments = ["Positive", "Negative"]

# Tab 2: keyword-extraction UI.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            beer_name_dropdown = gr.Dropdown(choices=beer_names, label="Beer_name", info="Choose the beer you want to see the summary keyword.")
            sentiment_dropdown = gr.Dropdown(choices=sentiments, label="Sentiment", info="Choose Positive or Negative sentiment.")
            keyword_input = gr.Textbox(label="Keyword", info="Enter a keyword to see its mappings.")
        with gr.Column():
            output_keywords = gr.Textbox(label="Summary keywords (Top 10)")
            output_mapping = gr.Textbox(label="Keyword mapping")
    with gr.Row():
        with gr.Column():
            review_button = gr.Button("Review Topic")
        with gr.Column():
            mapping_button = gr.Button("Keyword mapping")
    # flag=1 -> show_keywords returns only the formatted string, matching the
    # single Textbox output.
    review_button.click(fn=lambda beer_name, sentiment: show_keywords(beer_name, sentiment, flag=1),
                        inputs=[beer_name_dropdown, sentiment_dropdown],
                        outputs=output_keywords)
    mapping_button.click(keyword_mappings,
                         inputs=[beer_name_dropdown, sentiment_dropdown, keyword_input],
                         outputs=output_mapping)

# Tab 1: live sentiment analysis.
app1 = gr.Interface(fn=sentiment_analysis,
                    inputs="text",
                    outputs="label",
                    live=True,
                    title="Beer Sentiment Analysis")
# Tab 2
app2 = demo

# Group tabs 1 and 2. Use the single `gr` alias throughout — the file imports
# gradio twice (as `gra` and `gr`); both names bind the same module, so this
# only unifies style.
tabbed = gr.TabbedInterface([app1, app2],
                            ['Sentiment Analysis', 'Keyword Extraction'])
tabbed.launch()