# Hugging Face Space (status at capture time: Running)
import base64
import io
import json
import os
from collections import Counter

import gradio as gr
import malaya
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import plotly.graph_objects as go
import torch
from nltk import bigrams
from PIL import Image
from transformers import pipeline, AutoTokenizer
from wordcloud import WordCloud

from classifier import MistralForSequenceClassification
# --- Flagging / crowdsourced logging ---------------------------------------
# The token is read from the `hf_token` environment variable; os.getenv
# returns None when the variable is not set.
HF_TOKEN = os.getenv('hf_token')
hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, 'aisyahhrazak/tpb-crowdsourced-dataset')

# --- Stopwords --------------------------------------------------------------
# JSON is UTF-8 by specification, so do not depend on the locale default.
with open('en.json', encoding='utf-8') as fopen:
    en = json.load(fopen)

# Combined list: malaya's stopwords + the entries of en.json + forum/chat
# filler tokens, followed by a few multi-word phrases.
stopwords = malaya.text.function.get_stopwords()
stopwords = stopwords + en + ['lor', 'quote','Quote','QUOTE','pm', 'long', 'jer', 'time', 'feel', 'liao', 'wow', 'https', 'http', 've', 'ko', 'kena', 'post', 'ni', 'tu', 'don', 'je', 'jeh', 'la', 'tau', 'haha', 'hahaha', 'hahahaha']
stopwords += ['for me', 'to be', 'in the', 'me to', 'for me to']

# --- NLTK resources ---------------------------------------------------------
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)

# --- Models -----------------------------------------------------------------
# Both classifiers are paired with the same tokenizer in their pipelines.
tokenizer_tpb = AutoTokenizer.from_pretrained('mesolitica/malaysian-mistral-191M-MLM-512')
model_tpb = MistralForSequenceClassification.from_pretrained('HalalFoodNLP/tpb-model-halal', torch_dtype=torch.bfloat16)
model_sentiment = MistralForSequenceClassification.from_pretrained('malaysia-ai/sentiment-mistral-191M-MLM', torch_dtype=torch.bfloat16)
pipeline_tpb = pipeline(task="text-classification", model=model_tpb, tokenizer=tokenizer_tpb)
sentiment_pipeline = pipeline("sentiment-analysis", model=model_sentiment, tokenizer=tokenizer_tpb)

# --- Pre-labelled dataset used by the company view --------------------------
# One JSON object per line; later code reads the columns 'text', 'label'
# and 'tpb_label' from the resulting frame.
with open('sentiment-tpb-dataset.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file]
df = pd.DataFrame(data)
# Render a word cloud and hand it back as a PIL Image object
def generate_wordcloud(text):
    """Render *text* as a word cloud and return it as a PIL image.

    Args:
        text: Raw text to visualise.

    Returns:
        A PIL image holding the PNG rendering of the matplotlib figure, or
        ``None`` when *text* is empty/whitespace-only or WordCloud finds no
        usable words in it (it raises ``ValueError`` in that case).
    """
    # Nothing to draw: bail out before touching WordCloud/matplotlib.
    if not text or not text.strip():
        return None

    try:
        wordcloud = WordCloud(width=300, height=200, background_color='white').generate(text)
    except ValueError:
        # Raised by WordCloud when zero words survive its own filtering.
        return None

    # Work on an explicit figure so it is always closed, even if drawing or
    # saving fails, and so no other "current figure" is closed by accident.
    fig = plt.figure(figsize=(10, 5))
    try:
        axes = fig.add_subplot(111)
        axes.imshow(wordcloud, interpolation='bilinear')
        axes.axis('off')
        fig.tight_layout(pad=0)
        # Save the plot to a bytes buffer
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        plt.close(fig)

    # Rewind and convert the bytes buffer to a PIL Image
    buf.seek(0)
    return Image.open(buf)
# Extract the most frequent word pairs from a piece of text
def generate_bigrams(text):
    """Return the ten most common word bigrams in *text*.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``; tokens
    that are not purely alphanumeric or that appear in the module-level
    ``stopwords`` collection are discarded before pairing adjacent tokens.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``((word1, word2), count)`` tuples, most frequent first,
        containing at most ten entries.
    """
    # Build the lookup once per call: testing membership against the long
    # stopword list for every token is linear in the list length.
    stopword_set = set(stopwords)
    tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stopword_set
    ]
    return Counter(bigrams(tokens)).most_common(10)
def predict_decision(sentiment_label):
    """Translate a sentiment label into a purchase-likelihood message.

    Args:
        sentiment_label: Label to translate; ``'positive'`` and ``'neutral'``
            are recognised explicitly.

    Returns:
        The "High"/"Moderate" message for the two recognised labels and the
        "Low" message for every other value.
    """
    recognised_outcomes = (
        ('positive', "High likelihood of purchase"),
        ('neutral', "Moderate likelihood of purchase"),
    )
    for known_label, message in recognised_outcomes:
        if sentiment_label == known_label:
            return message
    # Any label that is not listed above falls through to the low bucket.
    return "Low likelihood of purchase"
# Recommendation text per TPB factor, appended verbatim under the factor's
# heading when its negative share exceeds the report threshold.
_TPB_RECOMMENDATIONS = {
    "attitude": """
**Current Issues:**
- High negative perception regarding product quality
- Concerns about halal certification and its authenticity
- Pricing issues in comparison to perceived value
**Recommended Actions:**
1. **Quality Control Improvements**
- Implement enhanced product quality measures
- Obtain globally recognized halal certifications
- Conduct regular quality audits
2. **Educational Campaigns**
- Educate customers on halal certification processes
- Raise awareness about the health benefits of halal products
- Highlight ethical and sustainable sourcing
3. **Pricing Strategy Adjustment**
- Reassess pricing to align with customer expectations
- Introduce discount programs or loyalty initiatives
""",
    "religious knowledge": """
**Current Issues:**
- Lack of awareness and understanding about the halal process
- Customers may be unsure of the religious guidelines followed
**Recommended Actions:**
1. **Religious Knowledge Enhancement**
- Provide clear educational materials on the halal process
- Collaborate with religious scholars to endorse products
- Ensure transparent labeling and certification
2. **Community Engagement**
- Host webinars or community events about halal
- Partner with local religious organizations for outreach
- Share customer testimonials emphasizing trust in your certification
""",
    "subjective norms": """
**Current Issues:**
- Social influence or peer pressure regarding halal compliance is weak
- Lack of community-driven recommendations for the product
**Recommended Actions:**
1. **Influence Social Circles**
- Engage community leaders or influencers to endorse products
- Create social campaigns around the halal certification to enhance peer recommendations
2. **Referral Programs**
- Introduce referral programs where existing customers can promote the product
- Offer incentives for customers who share their experiences with others
3. **Testimonials and Success Stories**
- Use customer testimonials and success stories to strengthen social trust
""",
    "perceived behavioural control": """
**Current Issues:**
- Perceived difficulty in understanding or accessing halal-certified products
- Concerns about control over product quality and sourcing transparency
**Recommended Actions:**
1. **Improve Accessibility**
- Make halal products more accessible through multiple platforms (e-commerce, retail stores)
- Ensure ease of purchase and fast delivery options
2. **Enhance Transparency**
- Provide detailed information about sourcing and production processes
- Use blockchain or similar technology to enhance transparency in halal certification
3. **Customer Empowerment**
- Offer customer feedback channels to empower users to voice concerns and suggestions
- Ensure that concerns are addressed promptly to build trust and satisfaction
""",
}


# Function to generate report based on TPB sentiment
def generate_report(tpb_sentiment_df, negative_threshold=70):
    """Build a Markdown recommendations report from per-factor sentiment shares.

    Args:
        tpb_sentiment_df: DataFrame with one row per TPB factor. It must have
            a ``tpb_label`` column; the ``negative`` column (percentage of
            negative items for that factor) is optional and treated as 0 when
            absent, which happens when no item in the data is negative.
        negative_threshold: A factor gets a section only when its negative
            percentage is strictly greater than this value. Defaults to 70.

    Returns:
        A Markdown string: the report title followed by one section for each
        row whose label has recommendation text and whose negative share
        exceeds the threshold. Rows are emitted in DataFrame order.
    """
    report = "## TPB Factor Analysis and Recommendations Report\n\n"
    for _, row in tpb_sentiment_df.iterrows():
        tpb_label = row['tpb_label']
        # The sentiment columns only exist for labels present in the data,
        # so a missing 'negative' column means "0% negative", not an error.
        negative_percentage = row.get('negative', 0.0)
        # Written as "not >" so that NaN is skipped just like a low value.
        if not negative_percentage > negative_threshold:
            continue
        recommendations = _TPB_RECOMMENDATIONS.get(tpb_label)
        if recommendations is None:
            # No recommendation text exists for this factor.
            continue
        report += f"### {tpb_label.capitalize()} ({negative_percentage:.1f}% Negative)\n"
        report += recommendations
    return report
# One placeholder per output component wired to the search button, so the
# "nothing to show" paths return the same number of values as the success path.
_EMPTY_SEARCH_RESULT = (None,) * 10


def _percentage_bar(series, **bar_kwargs):
    """Return a plotly Bar for *series*, labelling each bar as 'NN.N%'."""
    return go.Bar(
        x=series.index,
        y=series.values,
        text=[f'{val:.1f}%' for val in series.values],
        textposition='auto',
        **bar_kwargs
    )


def search_company(keyword):
    """Summarise the rows of the module-level dataset that mention *keyword*.

    The keyword is matched literally and case-insensitively against the
    ``text`` column of ``df``; rows with missing text never match.

    Args:
        keyword: Search term typed by the user.

    Returns:
        A 10-tuple, in this order:
        sentiment figure, TPB-factor figure, stacked sentiment-per-factor
        figure, up to five matching rows whose text is shorter than 300
        characters, the Markdown recommendation report, the word-cloud images
        for 'attitude', 'religious knowledge', 'subjective norms' and
        'perceived behavioural control' (``None`` for factors with no
        matching rows), and a DataFrame of the top bigrams per factor.
        All ten entries are ``None`` when the keyword is empty or nothing
        matches.
    """
    if not keyword:
        return _EMPTY_SEARCH_RESULT

    # regex=False: the keyword is user input, so characters such as "(" or
    # "?" must be matched literally instead of being parsed as a pattern.
    # na=False: rows without text count as non-matching.
    matches = df['text'].str.contains(keyword, case=False, regex=False, na=False)
    filtered_df = df[matches]
    if filtered_df.empty:
        return _EMPTY_SEARCH_RESULT

    # Overall sentiment distribution (percent of matching rows).
    sentiment_counts = filtered_df['label'].value_counts(normalize=True) * 100
    sentiment_colors = {'negative': 'red', 'neutral': 'gray'}
    colors = [sentiment_colors.get(sentiment, 'blue') for sentiment in sentiment_counts.index]
    sentiment_fig = go.Figure(data=[_percentage_bar(sentiment_counts, marker_color=colors)])
    sentiment_fig.update_layout(
        title='Overall Sentiment Distribution',
        xaxis_title='Sentiment',
        yaxis_title='Percentage'
    )

    # Overall TPB factor distribution.
    tpb_counts = filtered_df['tpb_label'].value_counts(normalize=True) * 100
    tpb_fig = go.Figure(data=[_percentage_bar(tpb_counts)])
    tpb_fig.update_layout(title='Overall TPB Factor Distribution', xaxis_title='TPB Factor', yaxis_title='Percentage')

    # Sentiment distribution within each TPB factor: rows are factors,
    # columns are the sentiment labels present, each row sums to 100.
    tpb_sentiment_df = filtered_df.groupby(['tpb_label', 'label']).size().unstack(fill_value=0)
    tpb_sentiment_df = tpb_sentiment_df.div(tpb_sentiment_df.sum(axis=1), axis=0) * 100

    color_map = {
        'negative': 'red',
        'neutral': 'gray',
        'positive': 'blue'
    }
    tpb_sentiment_fig = go.Figure()
    for sentiment in tpb_sentiment_df.columns:
        tpb_sentiment_fig.add_trace(_percentage_bar(
            tpb_sentiment_df[sentiment],
            name=sentiment,
            marker_color=color_map.get(sentiment, 'gray')
        ))
    tpb_sentiment_fig.update_layout(
        barmode='stack',
        title='Sentiment Distribution within TPB Factors',
        xaxis_title='TPB Factor',
        yaxis_title='Percentage'
    )

    report = generate_report(tpb_sentiment_df.reset_index())

    # Word cloud and top bigrams per TPB factor. The listed terms are removed
    # from the text first (they would otherwise dominate every cloud).
    wordclouds = {}
    bigrams_data = {}
    for label in filtered_df['tpb_label'].unique():
        text = ' '.join(filtered_df[filtered_df['tpb_label'] == label]['text']).replace('QUOTE','').replace('quote','').replace('sijil halal','').replace('halal','')
        wordclouds[label] = generate_wordcloud(text)
        bigrams_data[label] = generate_bigrams(text)

    # One column per factor holding only the bigrams (no frequencies).
    # Wrapping each column in a Series lets factors with fewer bigrams be
    # padded with NaN; plain lists of unequal length make pandas raise.
    bigram_df = pd.DataFrame({
        label: pd.Series([word_pair for word_pair, _ in top_pairs], dtype=object)
        for label, top_pairs in bigrams_data.items()
    })
    bigram_df.index = [f"Top {i+1}" for i in range(len(bigram_df))]

    sample_rows = filtered_df[filtered_df['text'].str.len() < 300].head(5)
    return (sentiment_fig, tpb_fig, tpb_sentiment_fig, sample_rows,
            report, wordclouds.get('attitude'), wordclouds.get('religious knowledge'),
            wordclouds.get('subjective norms'), wordclouds.get('perceived behavioural control'), bigram_df)
def text_classification_and_sentiment(text, keywords_df=None):
    """Classify *text* by TPB factor and sentiment and derive a purchase outlook.

    The sentiment predicted by the model is overridden to ``'negative'`` with
    score 1.0 when the text contains any keyword from the first column of the
    keyword table (case-insensitive substring match).

    Args:
        text: Comment to analyse.
        keywords_df: Optional DataFrame whose first column lists the override
            keywords. When omitted, the table is read from 'IMG_8137.xlsx',
            so the function can be called with the text alone.

    Returns:
        A tuple of three display strings: the TPB label line, the sentiment
        line with its probability, and the purchase decision line.
    """
    result_tpb = pipeline_tpb(text)
    tpb_label = result_tpb[0]['label']

    result_sentiment = sentiment_pipeline(text)
    sentiment_label = result_sentiment[0]['label']
    sentiment_score = result_sentiment[0]['score']

    if keywords_df is None:
        keywords_df = pd.read_excel('IMG_8137.xlsx')

    # Empty spreadsheet cells load as NaN and non-text cells as numbers;
    # normalise to stripped lower-case strings and drop blanks, since an
    # empty keyword is a substring of every text.
    keywords = [
        keyword.strip().lower()
        for keyword in keywords_df.iloc[:, 0].dropna().astype(str)
        if keyword.strip()
    ]
    lowered_text = text.lower()
    if any(keyword in lowered_text for keyword in keywords):
        sentiment_label = 'negative'
        sentiment_score = 1.0

    decision = predict_decision(sentiment_label)
    tpb_output = f"TPB Label: {tpb_label}"
    sentiment_output = f"Sentiment: {sentiment_label}\nProbability: {sentiment_score * 100:.2f}%"
    decision_output = f"Decision: {decision}"
    return tpb_output, sentiment_output, decision_output
# Sample comments offered as one-click inputs in the user view.
examples = [
    "Alhamdulillah, hari ni dapat makan dekat restoran halal baru. Rasa puas hati dan tenang bila tau makanan yang kita makan dijamin halal.",
    "Semua orang cakap kena check logo halal sebelum beli makanan. Dah jadi macam second nature dah sekarang. Korang pun sama kan?"
]

# Page-level stylesheet passed to gr.Blocks.
# `--body-background-fill` was previously written with four leading dashes,
# which declares a differently named custom property that nothing reads.
css = """
:root {
--bg: #FFFFFF; /* Set the background color to white */
--col: #191919; /* Define primary text color */
--bg-dark: #000000; /* Define dark background color if needed */
--col-dark: #ECF2F7; /* Define dark text color if needed */
--body-background-fill: #FFFFFF;
}
html, body {
background-color: var(--bg); /* Set the background color to white for the entire page */
margin: 0; /* Remove default body margin */
padding: 0; /* Remove default body padding */
}
.container {
max-width: 1000px;
margin: auto;
padding: 20px;
}
.title {
text-align: center;
margin-bottom: 20px;
}
.nav-buttons {
display: flex;
justify-content: center;
gap: 10px;
margin-bottom: 20px;
}
#recommendation_report {
background-color: #f9f9f9; /* Keep this background light for the report section */
padding: 20px;
border: 2px solid #e0e0e0;
border-radius: 10px;
margin-top: 20px;
font-family: Arial, sans-serif;
font-size: 14px;
}
.wrap-text {
white-space: normal !important;
word-wrap: break-word;
}
.footer {visibility: hidden}
"""
def _analyze_and_flag(text):
    """Run the classifiers on *text*, log the sample, and return the outputs.

    The logged sample is ``[text, tpb_result, sentiment_result]``, matching
    the three components registered with ``hf_writer.setup``.
    """
    # The second argument is passed explicitly so the call is valid whether
    # or not the classifier gives its keyword-table parameter a default.
    tpb_result, sentiment_result, decision_result = text_classification_and_sentiment(text, None)
    hf_writer.flag([text, tpb_result, sentiment_result])
    return tpb_result, sentiment_result, decision_result


# NOTE(review): the source this was recovered from had lost its indentation;
# the nesting of rows/tabs below is reconstructed from the component order.
with gr.Blocks(css=css + """
body, .gradio-container, .root, .wrap, #root .background .container {
background-color: white !important;
background-image: none !important;
background-fill: white !important;
}
""", theme='aisyahhrazak/miku-aisyah@=1.2.2') as demo:
    with gr.Tabs() as tabs:
        with gr.TabItem("User View", id=0):
            gr.Markdown("## Text Classification and Sentiment Analysis Based on User Input About Halal Food Acquisition")
            gr.Markdown("Enter a text to see TPB classification, sentiment analysis, and purchase prediction results!")
            input_text = gr.Textbox(lines=2, label="Input Comment", placeholder="Model can make mistakes, we are striving to improve the model.")
            with gr.Row():
                tpb_output = gr.Textbox(lines=3, label="TPB Classification")
                sentiment_output = gr.Textbox(lines=3, label="Sentiment Analysis")
                decision_output = gr.Textbox(lines=3, label="Purchase Prediction")
            # This needs to be called at some point prior to the first call to callback.flag()
            hf_writer.setup([input_text, tpb_output, sentiment_output], "flagged_data_points")
            classify_button = gr.Button("Analyze")
            # A single handler does both jobs: click() accepts exactly one
            # `fn`, so passing a flagging lambda positionally *and* fn=...
            # raises TypeError while the UI is being built.
            classify_button.click(
                fn=_analyze_and_flag,
                inputs=input_text,
                outputs=[tpb_output, sentiment_output, decision_output]
            )
            gr.Examples(examples=examples, inputs=input_text)
        with gr.TabItem("Company View", id=1):
            gr.Markdown("# Sentiment Analysis and Purchase Decision Factor for Halal Food Acquisition")
            # Distinct name so the user-view textbox above is not shadowed.
            search_input = gr.Textbox(lines=1, label="Search Keyword", placeholder="Enter keyword")
            search_button = gr.Button("Search")
            with gr.Row():
                sentiment_chart = gr.Plot(label="Sentiment Distribution")
                tpb_chart = gr.Plot(label="TPB Factor Distribution")
                tpb_sentiment_chart = gr.Plot(label="Sentiment Distribution within TPB Factors")
            # Word cloud outputs share a single row
            gr.Markdown("### Word Clouds by TPB Label")
            with gr.Row():
                attitude_wc = gr.Image(label="Attitude Word Cloud", height=200, width=300)
                religious_knowledge_wc = gr.Image(label="Religious Knowledge Word Cloud", height=200, width=300)
                subjective_norms_wc = gr.Image(label="Subjective Norms Word Cloud", height=200, width=300)
                perceived_behavioural_control_wc = gr.Image(label="Perceived Behavioural Control Word Cloud", height=200, width=300)
            with gr.Accordion("See Recommendation Details"):
                report_output = gr.Markdown(label="Recommendation Report", elem_id="recommendation_report")
            gr.Markdown("### Top Bigrams by TPB Label")
            bigram_table = gr.Dataframe(label="Top Bigrams for Each TPB Label")
            output_table = gr.Dataframe(
                headers=["text", "tpb_label", "sentiment", "score"],
                label="Company Analysis Results",
                wrap=True
            )
            # Output order must match the tuple returned by search_company.
            search_button.click(
                fn=search_company,
                inputs=search_input,
                outputs=[
                    sentiment_chart, tpb_chart, tpb_sentiment_chart, output_table, report_output,
                    attitude_wc, religious_knowledge_wc, subjective_norms_wc, perceived_behavioural_control_wc, bigram_table
                ]
            )
demo.launch()