Spaces:

EsoCode
/

poem_analysis

Sleeping

File size: 6,640 Bytes

996aa19

import os
import streamlit as st
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from transformers import pipeline
import nltk
import numpy as np
from utils import read_poems_from_directory, emotion_labels_with_colors

# Download the NLTK sentence-tokenizer data needed by nltk.sent_tokenize.
# 'punkt' is the resource older NLTK releases load; newer releases look up
# 'punkt_tab' instead and raise LookupError if only 'punkt' is present, so
# both are requested. nltk.download reports (rather than raises) when a
# resource cannot be fetched.
nltk.download('punkt')
nltk.download('punkt_tab')

# Emotion classifier pipelines, keyed by the display name used in the UI.
# return_all_scores=True asks each pipeline for a score per label instead of
# only the top label; the analysis code indexes the result accordingly.
# NOTE(review): recent transformers releases deprecate return_all_scores in
# favour of top_k=None — confirm the output nesting before switching.
models = {
    "Model 1": pipeline('sentiment-analysis', model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True),
    "Model 2": pipeline('sentiment-analysis', model="cardiffnlp/twitter-roberta-base-emotion", return_all_scores=True)
}

# Directory where uploaded poems are stored and from which poems are read;
# created at import time so later listing/reading never hits a missing path.
poems_directory = "./poems"
os.makedirs(poems_directory, exist_ok=True)

def _analyze_emotions(poem, model):
    """Classify every sentence of a poem.

    Args:
        poem: Full text of one poem.
        model: Callable classifier (one of the values of ``models``).

    Returns:
        A list holding the raw classifier result for each sentence produced
        by ``nltk.sent_tokenize``, in order.
    """
    return [model(sentence) for sentence in nltk.sent_tokenize(poem)]


def _process_emotions(emotions):
    """Convert raw classifier results into per-sentence ``{label: score}`` maps.

    Args:
        emotions: List of raw results as returned by ``_analyze_emotions``.
            Each result is indexed with ``[0]`` to reach the list of
            ``{'label': ..., 'score': ...}`` dicts.

    Returns:
        A tuple ``(emotion_scores, all_labels)`` where ``emotion_scores`` is a
        list of ``{label: score}`` dicts (one per sentence) and ``all_labels``
        is the set of every label seen.
    """
    emotion_scores = [
        {emo['label']: emo['score'] for emo in sentence_result[0]}
        for sentence_result in emotions
    ]
    all_labels = set()
    for sentence_scores in emotion_scores:
        all_labels.update(sentence_scores)
    return emotion_scores, all_labels


def _plot_emotional_arc(processed_emotions, labels, model_name):
    """Plot one score curve per (poem, emotion) pair and render it in Streamlit.

    Args:
        processed_emotions: One list of ``{label: score}`` dicts per poem.
        labels: Emotion labels to draw.
        model_name: Display name used in the sub-header and plot title.
    """
    st.subheader(model_name)
    # Use an explicit figure instead of pyplot's global state so that it can
    # be handed to Streamlit and closed afterwards (avoids leaking figures
    # across reruns).
    fig, ax = plt.subplots(figsize=(15, 10))
    for i, emotions in enumerate(processed_emotions):
        for emotion in labels:
            emotion_arc = [line_emotions.get(emotion, 0) for line_emotions in emotions]
            color = emotion_labels_with_colors.get(emotion, 'black')  # default to black if not found
            ax.plot(emotion_arc, label=f'Poem {i+1} - {emotion}', color=color)
    ax.set_title(f'Emotional Arc of Each Poem ({model_name})')
    ax.set_xlabel('Line Number')
    ax.set_ylabel('Emotion Score')
    handles, _ = ax.get_legend_handles_labels()
    if handles:  # skip the legend (and its warning) when nothing was drawn
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    st.pyplot(fig)
    plt.close(fig)


def _extract_features(emotion_data, labels):
    """Build a feature vector (mean and std of each label's scores) per poem.

    Args:
        emotion_data: One list of ``{label: score}`` dicts per poem.
        labels: Emotion labels to summarise, in output order.

    Returns:
        A list with one feature list per poem, laid out as
        ``[mean_0, std_0, mean_1, std_1, ...]`` following ``labels``.
        A poem without any scored sentence contributes zeros instead of NaN
        so that later scaling/clustering does not fail.
    """
    features = []
    for emotions in emotion_data:
        poem_features = []
        for label in labels:
            scores = [line_emotions.get(label, 0) for line_emotions in emotions]
            if scores:
                poem_features.extend([np.mean(scores), np.std(scores)])
            else:
                # np.mean/np.std of an empty list would yield NaN.
                poem_features.extend([0.0, 0.0])
        features.append(poem_features)
    return features


def _cluster_and_display(features, labels, model_name, n_clusters=2):
    """Show the feature table for a model and, when possible, its KMeans clusters.

    Args:
        features: Per-poem feature lists as produced by ``_extract_features``.
        labels: Non-empty list of emotion labels the features were built from.
        model_name: Display name used in captions and plot title.
        n_clusters: Number of KMeans clusters to fit.
    """
    columns = []
    for label in labels:
        columns.extend([f'{label}_mean', f'{label}_std'])
    df = pd.DataFrame(features, columns=columns)

    can_cluster = len(df) >= n_clusters
    if can_cluster:
        # Standardize so that no single feature dominates the distance metric.
        scaled_features = StandardScaler().fit_transform(df)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        kmeans.fit(scaled_features)
        df['Cluster'] = kmeans.labels_

    st.write(f"Poem Sentiment Features and Clusters ({model_name}):")
    st.dataframe(df)

    if not can_cluster:
        # KMeans raises when there are fewer samples than clusters.
        st.info(f"At least {n_clusters} poems are needed for clustering; skipping clusters for {model_name}.")
        return

    # Visualize the clusters on the first two (unscaled) feature columns.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df.iloc[:, 0], df.iloc[:, 1], c=df['Cluster'], cmap='viridis', marker='o')
    ax.set_title(f'Clusters of Poem Emotional Arcs ({model_name})')
    ax.set_xlabel(f'{columns[0]}')
    ax.set_ylabel(f'{columns[1]}')
    st.pyplot(fig)
    plt.close(fig)


def analyze_poems_page():
    """Render the "Analyze Poems" Streamlit page.

    The sidebar lets the user upload a ``.txt`` poem into ``poems_directory``,
    lists the poems already stored there, and accepts a comma-separated list
    of emotion labels. Pressing the "Analyze Poems" button runs every
    classifier in ``models`` over each poem, plots the per-sentence scores of
    the requested labels, and clusters the poems on the mean/std of those
    scores.
    """
    st.header("Analyze Poems")

    # Sidebar for file upload and listing files
    st.sidebar.title("Upload New Poem")
    uploaded_file = st.sidebar.file_uploader("Choose a text file", type="txt")

    if uploaded_file is not None:
        # Keep only the final path component so a crafted file name cannot
        # write outside the poems directory.
        safe_name = os.path.basename(uploaded_file.name)
        with open(os.path.join(poems_directory, safe_name), "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.sidebar.success(f"Uploaded {safe_name}")

    st.sidebar.title("Available Poems")
    poem_files = [f for f in os.listdir(poems_directory) if f.endswith(".txt")]
    # Render as a Markdown list: names joined by a bare newline would be
    # collapsed into a single paragraph.
    st.sidebar.write("\n".join(f"- {name}" for name in poem_files))

    # Sidebar input for user-specified labels
    user_labels_input = st.sidebar.text_input("Enter emotion labels (comma-separated)", 
                                              "happiness,sadness,anger,fear,disgust,surprise,love,joy,anxiety,contentment,frustration,loneliness,excitement,guilt,shame,envy,jealousy,pride,gratitude,empathy,compassion,boredom,relief,curiosity,awe,confusion,nostalgia,hope,despair,embarrassment")
    # Drop blanks and duplicates (order preserved) so each label yields
    # exactly one curve and one pair of feature columns.
    stripped_labels = (label.strip() for label in user_labels_input.split(","))
    user_labels = list(dict.fromkeys(label for label in stripped_labels if label))

    if not st.button("Analyze Poems"):
        return

    if not os.path.isdir(poems_directory):
        st.error("The specified path is not a valid directory.")
        return

    # Read poems from the specified directory
    poems = read_poems_from_directory(poems_directory)
    if not poems:
        st.warning("No text files found in the specified directory.")
        return

    # Analyze and plot for each model
    for model_name, model in models.items():
        processed_emotions = []
        all_labels = set()
        for poem in poems:
            processed, labels = _process_emotions(_analyze_emotions(poem, model))
            processed_emotions.append(processed)
            all_labels.update(labels)

        # Only labels this model actually emits can be plotted or clustered.
        selected_labels = [label for label in user_labels if label in all_labels]
        if not selected_labels:
            st.subheader(model_name)
            st.warning(
                f"None of the requested labels are produced by {model_name}. "
                f"Labels found: {', '.join(sorted(all_labels))}"
            )
            continue

        _plot_emotional_arc(processed_emotions, selected_labels, model_name)

        # Extract features and cluster the poems on them
        features = _extract_features(processed_emotions, selected_labels)
        _cluster_and_display(features, selected_labels, model_name)