import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from torch.nn.functional import softmax
import pandas as pd
import gradio as gr
from itertools import islice
from collections import defaultdict
import spacy
import ast
import os
from datasets import load_dataset
# df = load_dataset('csv', data_files=['pp_selected_reviews_JJ_NN.csv'])
df = pd.read_csv('/home/user/app/pp_selected_reviews_JJ_NN.csv')
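# Note: '/home/user/app' is the working directory of a Hugging Face Space;
# adjust this path (or use a relative one) when running elsewhere.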
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # Download the vectors model on first run, then load it.
    os.system("python -m spacy download en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
# λͺ¨λΈ λ‘œλ“œ
model = BertForSequenceClassification.from_pretrained('GiRak/beer-sentiment-bert') # HuggingFace 사전 ν•™μŠ΅ λͺ¨λΈ μ—…λ‘œλ“œ
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.eval()  # inference only; disables dropout
# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('GiRak/beer-sentiment-bert')
def analyze_sentiment(sentence):
    # Tokenize the sentence and move the tensors to the model's device
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True)
    inputs = inputs.to(device)
    # Run sentiment classification (no gradients needed at inference time)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probabilities = softmax(logits, dim=1)
    # Map each class probability to its label
    sentiment_labels = ['Negative', 'Positive']
    sentiment_probabilities = {label: probability.item() for label, probability in zip(sentiment_labels, probabilities[0])}
    return sentiment_probabilities
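# Thin wrapper so the Gradio Interface below has a single text-in handler.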
def sentiment_analysis(text):
    sentiment_probabilities = analyze_sentiment(text)
    return sentiment_probabilities
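# Illustrative usage (probabilities are made up):
#   sentiment_analysis("Great hoppy aroma and a smooth finish")
#   -> {'Negative': 0.02, 'Positive': 0.98}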
# Count the initial frequency of each keyword
def get_initial_counts(keywords):
    initial_counts = defaultdict(int)
    for keyword in keywords:
        initial_counts[keyword] += 1
    return initial_counts
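# (Equivalent to collections.Counter(keywords); kept as an explicit loop.)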
# Compare only the noun tokens of two docs and check whether any pair exceeds
# the noun similarity threshold. Adjective choice varies from reviewer to
# reviewer, while nouns name the feature itself, so nouns drive the grouping.
def custom_similarity(doc1, doc2, noun_similarity_threshold):
    doc1_nouns = [token for token in doc1 if token.pos_ == 'NOUN']
    doc2_nouns = [token for token in doc2 if token.pos_ == 'NOUN']
    noun_similar = any(t1.similarity(t2) > noun_similarity_threshold for t1 in doc1_nouns for t2 in doc2_nouns)
    return noun_similar
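# Note: spaCy warns (W008) and returns similarity 0.0 for tokens without a
# word vector, so out-of-vocabulary nouns never clear the threshold.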
# Merge similar keywords, accumulating frequency counts and recording which
# raw keywords were folded into each merged keyword.
def merge_keywords(keyword_counts, noun_similarity_threshold):
    merged_keywords = defaultdict(int)
    keyword_mappings = defaultdict(list)
    keyword_docs = {}
    # Memoization: embed each keyword with spaCy only once
    def get_keyword_doc(keyword):
        if keyword not in keyword_docs:
            keyword_docs[keyword] = nlp(keyword)
        return keyword_docs[keyword]
    # Walk the initial frequency dict and embed each keyword
    for keyword, count in keyword_counts.items():
        doc1 = get_keyword_doc(keyword)
        merged_or_found_similar = False
        # Compare against keywords that have already been merged
        for merged_keyword in list(merged_keywords):
            # Exact match with an existing merged keyword: just add the count
            if keyword == merged_keyword:
                merged_keywords[merged_keyword] += count
                merged_or_found_similar = True
                keyword_mappings[merged_keyword].append(keyword)
                break
            doc2 = get_keyword_doc(merged_keyword)
            # Similarity above the threshold: fold into the existing group
            if custom_similarity(doc1, doc2, noun_similarity_threshold):
                merged_keywords[merged_keyword] += count
                merged_or_found_similar = True
                keyword_mappings[merged_keyword].append(keyword)
                break
        # No exact or similar match found: start a new group
        if not merged_or_found_similar:
            merged_keywords[keyword] = count
            keyword_mappings[keyword] = [keyword]
    return merged_keywords, keyword_mappings
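# Illustrative behaviour (similarities are made up): with counts
# {'hop aroma': 3, 'hop smell': 2, 'bitterness': 1} and threshold 0.7, the
# result could be ({'hop aroma': 5, 'bitterness': 1},
#                  {'hop aroma': ['hop aroma', 'hop smell'], 'bitterness': ['bitterness']}).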
def merge_similar_keywords_noun_weight(dataframe, noun_similarity_threshold=0.7):
    # The 'Keywords' column holds lists serialized as strings; parse them back
    dataframe['Keywords_List'] = dataframe['Keywords'].apply(ast.literal_eval)
    all_keywords = dataframe['Keywords_List'].sum()
    # Count frequencies, then merge similar keywords
    initial_counts = get_initial_counts(all_keywords)
    filtered_and_merged_keywords, merged_keyword_mappings = merge_keywords(initial_counts, noun_similarity_threshold)
    sorted_merged_keywords = sorted(filtered_and_merged_keywords.items(), key=lambda x: x[1], reverse=True)
    # Drop the temporary 'Keywords_List' column
    dataframe.drop('Keywords_List', axis=1, inplace=True)
    return dict(sorted_merged_keywords), dict(merged_keyword_mappings)
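# Note: merge_keywords is greedy and order-dependent, and compares each new
# keyword against every existing group (roughly O(n^2) in distinct keywords);
# acceptable for a single beer's reviews, but worth keeping in mind.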
#########################################################################################################################
beer_df = df
# Filter one beer's reviews by predicted sentiment and summarize them.
# flag=1 returns only the formatted top-10 string (for the Gradio button);
# otherwise the keyword -> variants mapping is returned as well.
def show_keywords(beer_name, sentiment, flag=0):
    one_beer_df = beer_df[beer_df['Beer_name'] == beer_name]
    sentiment_df = one_beer_df[one_beer_df['MultinomialNB_label'] == sentiment].copy()
    sentiment_df.reset_index(drop=True, inplace=True)
    keywords, mappings = merge_similar_keywords_noun_weight(sentiment_df)
    top10 = "\n".join(f"{k}: {v}" for k, v in islice(keywords.items(), 10))
    if flag == 1:
        return top10
    return top10, mappings
def keyword_mappings(beer_name, sentiment, keyword):
    _, mappings = show_keywords(beer_name, sentiment)
    mapped_keywords = mappings.get(keyword, "No review keywords are mapped to that keyword.")
    return mapped_keywords
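# A minimal sketch of one way to avoid re-running the spaCy merge on every
# mapping lookup (assumes beer_name and sentiment stay hashable strings;
# cached_show_keywords is a hypothetical helper, not part of this app):
#
#   from functools import lru_cache
#
#   @lru_cache(maxsize=64)
#   def cached_show_keywords(beer_name, sentiment):
#       return show_keywords(beer_name, sentiment)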
beer_names = list(beer_df['Beer_name'].unique())
sentiments = ["Positive", "Negative"]
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            beer_name_dropdown = gr.Dropdown(choices=beer_names, label="Beer_name", info="Choose the beer whose summary keywords you want to see.")
            sentiment_dropdown = gr.Dropdown(choices=sentiments, label="Sentiment", info="Choose Positive or Negative sentiment.")
            keyword_input = gr.Textbox(label="Keyword", info="Enter a keyword to see its mappings.")
        with gr.Column():
            output_keywords = gr.Textbox(label="Summary keywords (Top 10)")
            output_mapping = gr.Textbox(label="Keyword mapping")
    with gr.Row():
        with gr.Column():
            review_button = gr.Button("Review Topic")
        with gr.Column():
            mapping_button = gr.Button("Keyword mapping")
    review_button.click(fn=lambda beer_name, sentiment: show_keywords(beer_name, sentiment, flag=1),
                        inputs=[beer_name_dropdown, sentiment_dropdown],
                        outputs=output_keywords)
    mapping_button.click(keyword_mappings,
                         inputs=[beer_name_dropdown, sentiment_dropdown, keyword_input],
                         outputs=output_mapping)
# Tab 1: sentence-level sentiment analysis
app1 = gr.Interface(fn=sentiment_analysis,
                    inputs="text",
                    outputs="label",
                    live=True,
                    title="Beer Sentiment Analysis")
# Tab 2: keyword-extraction dashboard
app2 = demo
# Group tabs 1 and 2
tabbed = gr.TabbedInterface([app1, app2],
                            ['Sentiment Analysis', 'Keyword Extraction'])
tabbed.launch()
# app1.launch()