Spaces:
Build error
Build error
| import streamlit as st | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| from wordcloud import WordCloud, STOPWORDS | |
| from reportlab.lib.pagesizes import letter | |
| from reportlab.pdfgen import canvas | |
| from reportlab.lib.units import inch | |
| from io import BytesIO | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| import torch | |
| import chardet | |
| import os | |
# Load model and tokenizer
MODEL_NAME = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"


@st.cache_resource
def _load_sentiment_model(model_name=MODEL_NAME):
    """Load the tokenizer and classification model for `model_name` once.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects as a shared resource avoids re-instantiating the
    tokenizer and model on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSequenceClassification.from_pretrained(model_name),
    )


# Module-level names are kept because analyze_sentiment() reads them directly.
tokenizer, model = _load_sentiment_model()
# Function to analyze sentiment
def analyze_sentiment(text):
    """Classify `text` with the module-level tokenizer and model.

    The input is tokenized (truncated to the tokenizer's limit), run through
    the model without gradient tracking, and the logits are converted to
    probabilities with a softmax over the class dimension.

    Args:
        text: the text to classify.
            NOTE(review): `.item()` below requires a single-row batch, so
            this presumably expects one string, not a list — confirm.

    Returns:
        dict with "label" ('NEGATIVE' for class index 0, 'POSITIVE' for
        class index 1) and "score" (the probability of that class).
    """
    class_names = ('NEGATIVE', 'POSITIVE')
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = logits.softmax(dim=1)
    # torch.max over dim=1 yields (best probability, index of best class).
    top_probability, top_index = torch.max(probabilities, dim=1)
    return {
        "label": class_names[top_index.item()],
        "score": top_probability.item(),
    }
# Function to detect file encoding
def detect_encoding(file):
    """Guess the encoding of a binary file-like object with chardet.

    The whole stream is read and handed to ``chardet.detect``. Afterwards a
    seekable stream is rewound to the start, so the caller (or a later
    reader of the same upload) does not find it exhausted at EOF.

    Args:
        file: object exposing ``read()`` that returns bytes; if it also
            reports ``seekable()`` as true it is rewound with ``seek(0)``.

    Returns:
        The value of the 'encoding' key in chardet's result.
        NOTE(review): this may be None when chardet has no guess — confirm
        against the chardet documentation before relying on a string.
    """
    rawdata = file.read()
    # Reading consumed the stream; restore the position when possible.
    if getattr(file, "seekable", lambda: False)():
        file.seek(0)
    return chardet.detect(rawdata)['encoding']
def generate_pdf(pie_chart_path, pos_wordcloud_path, neg_wordcloud_path):
    """Build a one-page PDF report from three image files.

    The page is 8.27in wide and 16.5in tall. From top to bottom it holds a
    title, the pie chart (5in x 4in), and then a heading plus a 6in x 3in
    word-cloud image for positive and for negative keywords.

    Args:
        pie_chart_path: path of the pie chart image.
        pos_wordcloud_path: path of the positive word-cloud image.
        neg_wordcloud_path: path of the negative word-cloud image.

    Returns:
        A BytesIO holding the finished PDF, positioned at offset 0.
    """
    page_width = 8.27 * inch
    page_height = 16.5 * inch
    buffer = BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=(page_width, page_height))

    # cursor_y tracks the top edge of the next element, moving down the page.
    cursor_y = page_height - 1 * inch

    # Title
    pdf.setFont("Helvetica-Bold", 20)
    pdf.drawString(2.2 * inch, cursor_y, "Sentiment Analysis Report")
    cursor_y -= 2 * inch

    # Pie chart; drawImage takes the lower-left corner, hence the subtraction.
    chart_width = 5 * inch
    chart_height = 4 * inch
    pdf.drawImage(
        pie_chart_path,
        1.5 * inch,
        cursor_y - chart_height,
        width=chart_width,
        height=chart_height,
    )
    cursor_y -= (chart_height + 1 * inch)  # chart plus spacing

    # Both keyword sections share one layout: heading, then a 2:1 image.
    keyword_sections = (
        ("Positive Keywords", pos_wordcloud_path),
        ("Negative Keywords", neg_wordcloud_path),
    )
    for heading, image_path in keyword_sections:
        pdf.setFont("Helvetica-Bold", 12)
        pdf.drawString(3 * inch, cursor_y, heading)
        pdf.drawImage(
            image_path,
            1 * inch,
            cursor_y - 3.3 * inch,
            width=6 * inch,
            height=3 * inch,
        )
        cursor_y -= (3 * inch + 1 * inch)  # image plus spacing

    pdf.save()
    buffer.seek(0)
    return buffer
# Streamlit UI
#
# Script section: renders the page, runs the analysis when "Go" is pressed,
# and offers the generated PDF for download. Streamlit re-runs this section
# on every interaction, so anything that must outlive a single run (the PDF
# bytes, the "show download" flag) is kept in st.session_state.

PIE_CHART_PATH = "sentiment_pie_chart.png"
POS_WORDCLOUD_PATH = "pos_wordcloud.png"
NEG_WORDCLOUD_PATH = "neg_wordcloud.png"


def _save_wordcloud(text, path, stopwords, empty_message):
    """Write a word cloud of `text` to `path`, or a placeholder image.

    WordCloud.generate raises ValueError when no words remain to plot
    (for example when no review received this sentiment label). In that
    case a blank image carrying `empty_message` is saved instead, so the
    PDF builder always finds a file at `path`.
    """
    try:
        cloud = WordCloud(
            max_font_size=80,
            max_words=10,
            background_color='white',
            stopwords=stopwords,
        ).generate(text)
    except ValueError:
        # 4x2 inches keeps the 2:1 shape the report layout expects.
        placeholder, axis = plt.subplots(figsize=(4, 2))
        axis.text(0.5, 0.5, empty_message, ha='center', va='center')
        axis.axis('off')
        placeholder.savefig(path)
        plt.close(placeholder)
    else:
        cloud.to_file(path)


def _build_report(df):
    """Score every review in `df`, render the images and return the PDF.

    `df` must have a 'review' column of strings; it is modified in place
    (sentiment columns are added). Returns the BytesIO from generate_pdf.
    """
    results = df['review'].apply(analyze_sentiment)
    df['sentiment_label'] = results.apply(lambda x: x['label'])
    df['sentiment_score'] = results.apply(lambda x: x['score'])

    # Pie chart of label frequencies
    sentiment_counts = df['sentiment_label'].value_counts()
    fig, ax = plt.subplots()
    ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=45)
    ax.set_title('Distribution of Sentiment')
    fig.savefig(PIE_CHART_PATH)
    # Close explicitly: figures otherwise accumulate across reruns.
    plt.close(fig)

    # Word clouds, one per sentiment label
    stopwords = set(STOPWORDS)
    pos_reviews = df[df['sentiment_label'] == 'POSITIVE']['review'].str.cat(sep=' ')
    neg_reviews = df[df['sentiment_label'] == 'NEGATIVE']['review'].str.cat(sep=' ')
    _save_wordcloud(pos_reviews, POS_WORDCLOUD_PATH, stopwords, "No positive keywords")
    _save_wordcloud(neg_reviews, NEG_WORDCLOUD_PATH, stopwords, "No negative keywords")

    return generate_pdf(PIE_CHART_PATH, POS_WORDCLOUD_PATH, NEG_WORDCLOUD_PATH)


st.title("Sentiment Analysis and Reporting")

# Initialize session state for button visibility and the generated report
if 'show_pdf_download' not in st.session_state:
    st.session_state.show_pdf_download = False
if 'pdf_bytes' not in st.session_state:
    st.session_state.pdf_bytes = None

# Sidebar for encoding detection and reset button
st.sidebar.header("File Encoding Checker")

# File uploader in the sidebar (named separately from the main uploader so
# the two uploads cannot be confused)
encoding_check_file = st.sidebar.file_uploader("Upload CSV file for Encoding Check", type=["csv"])
if encoding_check_file:
    # Detect the encoding
    encoding = detect_encoding(encoding_check_file)
    st.sidebar.write(f"Detected encoding: {encoding}")

# Reset button in the sidebar
if st.sidebar.button("Reset Analysis"):
    for generated_path in (PIE_CHART_PATH, POS_WORDCLOUD_PATH, NEG_WORDCLOUD_PATH):
        if os.path.exists(generated_path):
            os.remove(generated_path)
    # Also forget the previous report so its download button disappears.
    st.session_state.show_pdf_download = False
    st.session_state.pdf_bytes = None
    st.sidebar.write("Files deleted. Please re-upload a file to start over.")

# File uploader for sentiment analysis
uploaded_file = st.file_uploader("Upload CSV file for Sentiment Analysis", type=["csv"])

# Dropdown for encoding specification in the main panel
encodings = ['utf-8', 'latin-1', 'ISO-8859-1', 'ASCII', 'UTF-16', 'UTF-32', 'ANSI', "Windows-1251", 'Windows-1252']
user_encoding = st.selectbox("Select Encoding", options=encodings, index=0)
# 'ANSI' is not a codec name Python recognises; passing it to read_csv raises
# LookupError. NOTE(review): it is mapped to cp1252 on the assumption that
# users mean the Western-European Windows code page — confirm.
codec_aliases = {'ANSI': 'cp1252'}

# Button to start processing
if st.button("Go"):
    # A new run invalidates whatever report was produced before.
    st.session_state.show_pdf_download = False
    st.session_state.pdf_bytes = None

    if uploaded_file:
        try:
            # Load the CSV file into DataFrame with specified encoding
            uploaded_file.seek(0)  # Reset the file pointer to the beginning
            df = pd.read_csv(uploaded_file, encoding=codec_aliases.get(user_encoding, user_encoding))
        except (UnicodeError, LookupError):
            # UnicodeError also covers UTF-16/UTF-32 BOM failures, which are
            # not UnicodeDecodeError; LookupError covers unknown codec names.
            st.error("Error decoding the file. Please specify the correct encoding.")
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            st.error("The file could not be parsed as CSV. Please check its contents.")
        else:
            # Check if the DataFrame has exactly one column
            if df.shape[1] != 1:
                st.warning("The CSV file should only contain one column with review data.")
            else:
                # Rename the column to 'review'
                df.columns = ['review']
                # Clean up the DataFrame
                df['review'] = df['review'].astype(str).str.strip()
                # Keep reviews of at most 512 characters; copy so the new
                # columns are added to an independent frame, not a view.
                df = df[df['review'].str.len() <= 512].copy()
                if df.empty:
                    st.warning("No reviews of 512 characters or fewer were found in the file.")
                else:
                    pdf_output = _build_report(df)
                    st.write("Processing complete!")
                    # Persist the report so the download button survives
                    # the rerun triggered by clicking it.
                    st.session_state.pdf_bytes = pdf_output.getvalue()
                    st.session_state.show_pdf_download = True
    else:
        st.info("Please upload a CSV file to get started.")

# Display the download button whenever a report is available
if st.session_state.show_pdf_download and st.session_state.pdf_bytes is not None:
    st.download_button(
        "Download PDF Report",
        st.session_state.pdf_bytes,
        file_name="sentiment_analysis_report.pdf",
        mime="application/pdf",
    )