File size: 7,779 Bytes
b5b6102
3bd3f7f
6770282
8485d8a
e041f9c
 
 
 
 
 
0755ef3
863e5cb
3bd3f7f
83bb60d
dc101b2
863e5cb
 
 
 
 
 
d88da3d
6770282
 
 
 
 
 
 
3bd3f7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863e5cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9e2c51
b552ffa
863e5cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc07256
 
a87f48c
3bd3f7f
 
 
 
 
a60c2a3
863e5cb
dc07256
f0c38e0
863e5cb
 
 
df48a2b
863e5cb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import ast
import os
from collections import Counter
from collections import defaultdict
from itertools import islice

import gradio as gra
import gradio as gr
import pandas as pd
import spacy
import torch
from datasets import load_dataset
from torch.nn.functional import softmax
from transformers import BertTokenizerFast, BertForSequenceClassification

# Review data with precomputed keyword lists and sentiment labels.
# NOTE(review): hard-coded Spaces path — confirm it matches the deployment layout.
df = pd.read_csv('/home/user/app/pp_selected_reviews_JJ_NN.csv')

# Load the spaCy model; if it is not installed, download it and retry.
# The original bare `except` swallowed every error and, after downloading,
# never bound `nlp` — catch only the "model not installed" OSError and re-load.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")

# λͺ¨λΈ λ‘œλ“œ
model = BertForSequenceClassification.from_pretrained('GiRak/beer-sentiment-bert') # HuggingFace 사전 ν•™μŠ΅ λͺ¨λΈ μ—…λ‘œλ“œ
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# ν† ν¬λ‚˜μ΄μ € μ΄ˆκΈ°ν™”
tokenizer = BertTokenizerFast.from_pretrained('GiRak/beer-sentiment-bert')

def analyze_sentiment(sentence):
    """Classify one sentence with the fine-tuned BERT model.

    Args:
        sentence: raw review text.

    Returns:
        dict mapping 'Negative' / 'Positive' to their softmax probabilities.
    """
    # Tokenize and move the input tensors to the model's device.
    inputs = tokenizer(sentence, return_tensors='pt').to(device)

    # Inference only: skip autograd bookkeeping (the original built a
    # gradient graph on every request for no reason).
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = softmax(logits, dim=1)

    # NOTE(review): label order assumed to match the checkpoint's id2label
    # (0 = Negative, 1 = Positive) — confirm against the training config.
    sentiment_labels = ['Negative', 'Positive']
    return {
        label: probability.item()
        for label, probability in zip(sentiment_labels, probabilities[0])
    }

def sentiment_analysis(text):
    """Gradio entry point: delegate straight to analyze_sentiment."""
    return analyze_sentiment(text)

nlp = spacy.load('en_core_web_lg')

# ν‚€μ›Œλ“œμ˜ 초기 λΉˆλ„ 수λ₯Ό 계산
def get_initial_counts(keywords):
    initial_counts = defaultdict(int)
    for keyword in keywords:
        initial_counts[keyword] += 1
    return initial_counts

# Nouns carry the feature itself (adjectives vary by reviewer), so grouping
# is decided on noun-to-noun similarity only.
def custom_similarity(doc1, doc2, noun_similarity_threshold):
    """Return True when any noun pair across the two docs exceeds the threshold."""
    nouns_a = [tok for tok in doc1 if tok.pos_ == 'NOUN']
    nouns_b = [tok for tok in doc2 if tok.pos_ == 'NOUN']
    for left in nouns_a:
        for right in nouns_b:
            if left.similarity(right) > noun_similarity_threshold:
                return True
    return False

# Merge similar keywords and record the count and raw-keyword mapping per group.
def merge_keywords(keyword_counts, noun_similarity_threshold):
    """Group similar keywords and accumulate their counts.

    Args:
        keyword_counts: mapping of keyword -> occurrence count.
        noun_similarity_threshold: noun-similarity cutoff for grouping.

    Returns:
        (merged_keywords, keyword_mappings): merged counts keyed by the first
        keyword seen in each group, and the raw keywords folded into each group.
    """
    merged_keywords = defaultdict(int)
    keyword_mappings = defaultdict(list)
    keyword_docs = {}

    # Memoize nlp() embeddings so each keyword is processed only once.
    def get_keyword_doc(keyword):
        if keyword not in keyword_docs:
            keyword_docs[keyword] = nlp(keyword)
        return keyword_docs[keyword]


    # Walk the initial frequency dict, embedding each keyword.
    for keyword, count in keyword_counts.items():
        doc1 = get_keyword_doc(keyword)
        merged_or_found_similar = False

        # Compare against keywords already merged.
        for merged_keyword in list(merged_keywords):
            # If the current keyword is already a merged key, add its count.
            # NOTE(review): dict keys are unique, so this branch looks
            # unreachable on a single pass — confirm before relying on it.
            if keyword == merged_keyword:
                merged_keywords[merged_keyword] += count
                merged_or_found_similar = True
                keyword_mappings[merged_keyword].append(keyword)
                break

            doc2 = get_keyword_doc(merged_keyword)

            # Similarity above threshold: fold into the existing group.
            if custom_similarity(doc1, doc2, noun_similarity_threshold):
                merged_keywords[merged_keyword] += count
                merged_or_found_similar = True
                keyword_mappings[merged_keyword].append(keyword)
                break

        # No existing group matched: start a new one.
        if not merged_or_found_similar:
            merged_keywords[keyword] = count
            keyword_mappings[keyword] = [keyword]

    return merged_keywords, keyword_mappings

def merge_similar_keywords_noun_weight(dataframe, noun_similarity_threshold=0.7):
    """Parse, count, and merge the keywords of a review dataframe.

    Args:
        dataframe: must have a 'Keywords' column of stringified Python lists.
        noun_similarity_threshold: passed through to merge_keywords.

    Returns:
        (merged counts sorted by frequency desc, keyword -> raw-variant mapping),
        both as plain dicts.
    """
    # Parse the stringified lists without mutating the caller's dataframe
    # (the original added and then dropped a 'Keywords_List' column in place).
    keyword_lists = dataframe['Keywords'].apply(ast.literal_eval)
    # Flatten with a comprehension; Series.sum() on lists is quadratic.
    all_keywords = [kw for lst in keyword_lists for kw in lst]

    # Count and merge.
    initial_counts = get_initial_counts(all_keywords)
    merged_counts, merged_mappings = merge_keywords(initial_counts, noun_similarity_threshold)
    sorted_merged = sorted(merged_counts.items(), key=lambda kv: kv[1], reverse=True)

    return dict(sorted_merged), dict(merged_mappings)

#########################################################################################################################

# NOTE(review): duplicate import — islice is already imported at the top of the file.
from itertools import islice

# Alias for the review dataframe loaded at module top level.
beer_df = df

def show_keywords(beer_name, sentiment, flag=0):
    """Return the top-10 merged keywords for one beer/sentiment slice.

    Args:
        beer_name: matched against the 'Beer_name' column.
        sentiment: matched against the 'MultinomialNB_label' column.
        flag: 1 -> return only the formatted keyword string;
              0 -> also return the keyword -> raw-variants mapping.
    """
    # Filter in a single boolean mask (the original chained two slices and
    # reset the index in place, risking SettingWithCopy warnings) and stop
    # shadowing the module-level `df`.
    selected = beer_df[
        (beer_df['Beer_name'] == beer_name)
        & (beer_df['MultinomialNB_label'] == sentiment)
    ].reset_index(drop=True)
    keywords, mappings = merge_similar_keywords_noun_weight(selected)

    top10 = "\n".join(f"{k}: {v}" for k, v in islice(keywords.items(), 10))
    if flag == 1:
        return top10
    return top10, mappings

def keyword_mappings(beer_name, sentiment, keyword):
    """Look up which raw keywords were merged under `keyword` for this slice."""
    result = show_keywords(beer_name, sentiment)
    mapping_table = result[1]
    if keyword in mapping_table:
        return mapping_table[keyword]
    return "ν•΄λ‹Ή ν‚€μ›Œλ“œμ— λ§€ν•‘λœ λ¬Έμžμ—΄μ΄ μ—†μŠ΅λ‹ˆλ‹€."

# Dropdown choices for the UI.
beer_names = beer_df['Beer_name'].unique().tolist()
sentiments = ["Positive", "Negative"]


# Keyword-exploration UI: pick a beer and a sentiment, view the top-10 merged
# keywords, and look up which raw keywords were folded into a chosen group.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # Input controls.
            beer_name_dropdown = gr.Dropdown(choices=beer_names, label="Beer_name", info="Choose the beer you want to see the summary keyword.")
            sentiment_dropdown = gr.Dropdown(choices=sentiments, label="Sentiment", info="Choose Positive or Negative sentiment.")
            keyword_input = gr.Textbox(label="Keyword", info="Enter a keyword to see its mappings.")
        with gr.Column():
            # Output panes.
            output_keywords = gr.Textbox(label="Summary keywords (Top 10)")
            output_mapping = gr.Textbox(label="Keyword mapping")
    with gr.Row():
        with gr.Column():
            review_button = gr.Button("Review Topic")
        with gr.Column():
            mapping_button = gr.Button("Keyword mapping")

    # flag=1 makes show_keywords return only the formatted string.
    review_button.click(fn=lambda beer_name, sentiment: show_keywords(beer_name, sentiment, flag=1),
                    inputs=[beer_name_dropdown, sentiment_dropdown],
                    outputs=output_keywords)
    mapping_button.click(keyword_mappings,
                        inputs=[beer_name_dropdown, sentiment_dropdown, keyword_input],
                        outputs=output_mapping)

# Tab 1: free-text sentiment analysis.
app1 = gr.Interface(fn=sentiment_analysis,
                    inputs="text",
                    outputs="label",
                    live=True,
                    title="Beer Sentiment Analysis")

# Tab 2: keyword exploration UI built above.
app2 = demo

# Group tabs 1 and 2; use the single `gr` alias consistently
# (the original mixed `gra` and `gr`, both aliases of gradio).
tabbed = gr.TabbedInterface([app1, app2],
                            ['Sentiment Analysis', 'Keyword Extraction'])
tabbed.launch()