|
from transformers import RobertaForSequenceClassification, AutoTokenizer, AutoModelForTokenClassification, pipeline |
|
import torch |
|
import nltk |
|
import docx2txt |
|
import pandas as pd |
|
import os |
|
import matplotlib.pyplot as plt |
|
import openpyxl |
|
from openpyxl.styles import Font, Color, PatternFill |
|
from openpyxl.styles.colors import WHITE |
|
import gradio as gr |
|
|
|
# Fetch the NLTK 'punkt' tokenizer data at import time (no-op when cached).
# NOTE(review): nltk is not visibly used elsewhere in this chunk — presumably
# a dependency of docx/text handling; confirm before removing.
nltk.download('punkt')
|
|
|
|
|
# Vietnamese sentiment classifier (PhoBERT fine-tune). use_fast=False —
# presumably because this tokenizer has no fast (Rust) implementation; verify
# against the model card before changing.
senti_model = RobertaForSequenceClassification.from_pretrained("wonrax/phobert-base-vietnamese-sentiment")
senti_tokenizer = AutoTokenizer.from_pretrained("wonrax/phobert-base-vietnamese-sentiment", use_fast=False)

# Vietnamese word-segmentation model wrapped in a token-classification
# pipeline; segmentation() below converts its tags into underscore-joined
# words, the input format the sentiment model expects.
seg_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/vi-word-segmentation")
seg_model = AutoModelForTokenClassification.from_pretrained("NlpHUST/vi-word-segmentation")
nlp = pipeline("token-classification", model=seg_model, tokenizer=seg_tokenizer)
|
|
|
|
|
|
|
def segmentation(sentences, ner=None):
    """Word-segment Vietnamese sentences via a token-classification pipeline.

    The segmentation model emits one entry per (sub)token: subword pieces
    carry a "##" prefix and are glued to the running text, tokens tagged
    "I" continue the current word and are joined with "_", and anything
    else starts a new space-separated word.

    Args:
        sentences: iterable of raw sentence strings.
        ner: optional pipeline callable returning a list of
            {"word": str, "entity": str} dicts; defaults to the
            module-level `nlp` pipeline. Injectable for testing.

    Returns:
        list[str]: one segmented sentence per input sentence.
    """
    if ner is None:
        ner = nlp  # module-level HuggingFace pipeline
    segmented_sentences = []
    for sentence in sentences:
        sentence_tok = ""
        for e in ner(sentence):
            word = e["word"]
            if word.startswith("##"):
                # Subword continuation: drop only the prefix marker.
                # (The original tested `"##" in word` and replaced every
                # occurrence, which mishandles a literal mid-token "##".)
                sentence_tok += word[2:]
            elif e["entity"] == "I":
                # Inside a multi-syllable word: join with underscore.
                sentence_tok += "_" + word
            else:
                sentence_tok += " " + word
        segmented_sentences.append(sentence_tok.strip())
    return segmented_sentences
|
|
|
|
|
|
|
def read_file(docx):
    """Extract the non-empty, stripped text lines from a .docx file.

    Args:
        docx: path to the .docx document.

    Returns:
        list[str]: cleaned lines. On a read failure the error is printed
        and an empty list is returned — the original returned None here,
        which raised TypeError in every downstream caller that iterates
        or zips the result (file_analysis, process_file).
    """
    try:
        text = docx2txt.process(docx)
    except Exception as e:
        # Best-effort behavior preserved (print, don't raise), but with a
        # safe empty result instead of an implicit None.
        print(f"Error reading file: {e}")
        return []
    lines = [line.strip() for line in text.split('\n')]
    return [line for line in lines if line]
|
|
|
|
|
|
|
def analyze(sentence, tokenizer=None, model=None):
    """Score the sentiment of one (word-segmented) sentence.

    Args:
        sentence: input text — expected to already be word-segmented
            (underscore-joined) for PhoBERT; see segmentation().
        tokenizer: optional override exposing .encode(str) -> list[int];
            defaults to the module-level `senti_tokenizer`.
        model: optional override returning an object with a .logits
            tensor; defaults to the module-level `senti_model`.

    Returns:
        list[float]: softmax class probabilities for the sentence
        (one value per model label).
    """
    if tokenizer is None:
        tokenizer = senti_tokenizer
    if model is None:
        model = senti_model
    input_ids = torch.tensor([tokenizer.encode(sentence)])
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        out = model(input_ids)
        probs = out.logits.softmax(dim=-1).tolist()
    return probs[0]
|
|
|
|
|
def file_analysis(docx):
    """Read a .docx file, word-segment every line, and score each one.

    Args:
        docx: path to the .docx document.

    Returns:
        list of per-line sentiment score lists, in document order.
    """
    raw_lines = read_file(docx)
    segmented = segmentation(raw_lines)
    return [analyze(line) for line in segmented]
|
|
|
|
|
def generate_pie_chart(df):
    """Render the average sentiment distribution to 'pie_chart.png'.

    Args:
        df: DataFrame with numeric 'Negative', 'Positive' and 'Neutral'
            columns (one row per analysed sentence).

    Returns:
        str: path of the saved PNG file.
    """
    labels = ['Negative', 'Positive', 'Neutral']
    scores = [df[label].mean() for label in labels]

    # Grey / green / blue — matches the Excel fills in generate_excel_file.
    colors = ['#BDBDBD', '#9ACD32', '#87CEFA']

    # Use an explicit figure instead of the implicit pyplot "current figure":
    # in a long-running Gradio process the original could draw onto whatever
    # figure happened to be active, accumulating wedges/titles across calls.
    fig, ax = plt.subplots()
    ax.pie(scores, labels=labels, colors=colors, autopct='%1.1f%%')
    ax.set_title('Average Scores by Sentiment')

    pie_chart_name = 'pie_chart.png'
    fig.savefig(pie_chart_name)
    plt.close(fig)  # release the figure's memory explicitly

    return pie_chart_name
|
|
|
|
|
def generate_excel_file(df):
    """Write the scored sentences to 'result.xlsx' with colour coding.

    For each row the winning sentiment (highest score) determines the
    colour: the winning score cell is filled, and the Text cell gets the
    same fill (with a white font on the grey 'Negative' fill for
    readability).

    Args:
        df: DataFrame with columns ['Negative', 'Positive', 'Neutral',
            'Text'] in that order — cells are written positionally.

    Returns:
        str: path of the saved workbook.
    """
    wb = openpyxl.Workbook()
    ws = wb.active

    # Bold header row.
    headers = ['Negative', 'Positive', 'Neutral', 'Text']
    for col_num, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col_num)
        cell.value = header
        cell.font = Font(bold=True)

    fill_dict = {
        'Negative': PatternFill(start_color='BDBDBD', end_color='BDBDBD', fill_type='solid'),
        'Positive': PatternFill(start_color='9ACD32', end_color='9ACD32', fill_type='solid'),
        'Neutral': PatternFill(start_color='87CEFA', end_color='87CEFA', fill_type='solid')
    }

    # enumerate() gives the sheet row position. The original used the
    # DataFrame index value from iterrows() directly, which writes to the
    # wrong (or an invalid) row for any non-default index.
    for row_offset, (_, row_data) in enumerate(df.iterrows()):
        sentiment_cols = ['Negative', 'Positive', 'Neutral']
        scores = [row_data[col] for col in sentiment_cols]
        max_score = max(scores)
        sentiment = sentiment_cols[scores.index(max_score)]

        for col_num, col_data in enumerate(row_data, 1):
            cell = ws.cell(row=row_offset + 2, column=col_num)
            cell.value = col_data
            if col_num in (1, 2, 3):
                # Highlight only the winning score cell.
                if col_data == max_score:
                    cell.fill = fill_dict[sentiment]
            elif col_num == 4:
                fill = fill_dict[sentiment]
                # openpyxl normalises 6-digit colours to 8-digit ARGB
                # ('00BDBDBD'), so the original equality test against
                # 'BDBDBD' never matched; compare by suffix instead.
                rgb = fill.start_color.rgb or ''
                font_color = WHITE if rgb.endswith('BDBDBD') else Color('000000')
                cell.fill = fill
                cell.font = Font(color=font_color)
                # (The original re-applied the fill when the text equalled
                # max_score — a string/float comparison that is never true;
                # that dead branch is removed.)

    excel_file_path = 'result.xlsx'
    wb.save(excel_file_path)
    return excel_file_path
|
|
|
|
|
def process_file(docx):
    """Run the full document pipeline: score, chart, and export to Excel.

    Args:
        docx: path to the .docx document.

    Returns:
        tuple[str, str]: (pie chart PNG path, Excel workbook path).
    """
    scores = file_analysis(docx)

    frame = pd.DataFrame(scores, columns=['Negative', 'Positive', 'Neutral'])
    frame['Text'] = read_file(docx)

    chart_path = generate_pie_chart(frame)
    workbook_path = generate_excel_file(frame)

    return chart_path, workbook_path
|
|
|
def _sentence_report(sentence):
    """Segment one sentence, score it, and format 'Label: score' lines."""
    segmented_sentence = segmentation([sentence])
    results = analyze(segmented_sentence[0])
    label_names = ['Negative', 'Positive', 'Neutral']
    return "".join(
        f"{label}: {score:.2f}\n" for label, score in zip(label_names, results)
    )


def analyze_file(file, sentence):
    """Gradio handler: analyse an uploaded .docx and/or a typed sentence.

    Args:
        file: uploaded file object (exposes .name) or None.
        sentence: free-text input, possibly empty.

    Returns:
        tuple: (excel path or None, pie chart path or None,
        formatted score text or None) — always three values. The original
        duplicated the sentence-scoring code in two branches and fell off
        the end (returning a bare None) when both inputs were empty, which
        breaks Gradio's three-output unpacking.
    """
    excel_file_path = pie_chart_name = output_text = None

    if file:
        pie_chart_name, excel_file_path = process_file(file.name)
    if sentence:
        output_text = _sentence_report(sentence)

    return excel_file_path, pie_chart_name, output_text
|
|
|
# UI components: optional .docx upload + optional free-text sentence in,
# (Excel file, pie-chart image, score text) out — matching analyze_file's
# three return values. Labels are in Vietnamese (runtime strings; do not
# translate).
# NOTE(review): gr.inputs / gr.outputs is the legacy pre-3.0 Gradio
# namespace, removed in Gradio 3.x — confirm the pinned gradio version
# before upgrading (modern equivalents: gr.File, gr.Textbox, gr.Image).
inputs = [
    gr.inputs.File(label="Chọn Tệp Bạn Muốn Phân Tích"),
    gr.inputs.Textbox(label="Nhập Văn Bản")
]
outputs = [
    gr.outputs.File(label="Kết Quả Phân Tích Excel"),
    gr.outputs.Image(type="filepath", label="Thông Số Phân Tích"),
    gr.outputs.Textbox(label="Kết Quả Phân Tích")
]
|
|
|
# Wire the handler to the UI; flagging is disabled since the outputs are
# generated files rather than user feedback targets.
interface = gr.Interface(
    fn=analyze_file,
    inputs=inputs,
    outputs=outputs,
    title="Sentiment Analysis",
    allow_flagging="never"
)


if __name__ == "__main__":
    # share=True requests a temporary public Gradio link in addition to
    # the local server.
    interface.launch(share=True)
|
|