# imBesideYou / transcrip_score.py
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import google.generativeai as genai
import numpy as np
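
# Expected CSV schema (inferred from the columns referenced below, not from a
# published spec): one row per transcript segment, with numeric scores in
# 'positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise',
# 'enthusiastic', 'speech_speed', and a 'start' timestamp used for the
# time-series plots.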
import os

# Set up Gemini API; read the key from the GOOGLE_API_KEY environment variable
# instead of hardcoding a secret in the source file.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))
def load_and_preprocess_data(file):
    """Load a transcript-score CSV, coerce the score columns to numeric, and drop incomplete rows."""
    data = pd.read_csv(file)
    numeric_columns = ['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic', 'speech_speed']
    for col in numeric_columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    data = data.dropna()
    return data
def calculate_transcript_stats(data, feature):
    """Return mean, median, and standard deviation for a single feature column."""
    return {
        'mean': data[feature].mean(),
        'median': data[feature].median(),
        'std': data[feature].std()
    }
def calculate_feature_percentages(data):
    """Return each sentiment column's share (in %) of the total sentiment mass."""
    feature_columns = ['positive', 'negative', 'neutral']
    total = data[feature_columns].sum().sum()
    percentages = (data[feature_columns].sum() / total * 100).round(2)
    return percentages
def format_transcript_prompt(features, data_dict, selected_candidates, feature_percentages=None):
    """Build the analysis prompt sent to Gemini, embedding per-candidate statistics."""
    prompt = """
You are an AI assistant specializing in transcript analysis. You have access to transcript data for the following candidates: {CANDIDATES}, focusing on these features: {FEATURES}.
{STATS}
Based on this data:
1. Compare the overall levels of the specified features across the selected candidates. Which candidates exhibit more positive/negative sentiment or higher confidence?
2. Analyze the distribution of features for each selected candidate. Are they evenly spread or concentrated at certain levels?
3. Discuss any significant differences in speech patterns across the selected candidates. What might these differences suggest about their communication styles?
4. Consider the variability of features for each selected candidate. Do some candidates have more consistent levels, or do they fluctuate more?
5. Based on this transcript data, hypothesize about potential speaking styles or topics that might contribute to the observed patterns.
6. How might the differences in speech patterns between these selected candidates affect the overall listener experience?
Provide a detailed analysis addressing these points, using specific data references where relevant. Your analysis should offer insights into how these features characterize each selected candidate's communication style and what this reveals about their potential impact on the audience.
"""
    stats = ""
    all_features = ['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic', 'speech_speed']
    for candidate in selected_candidates:
        stats += f"\n{candidate}:\n"
        if feature_percentages is not None and isinstance(feature_percentages, pd.Series):
            # Sentiment shares were precomputed by the visualization step.
            for feature in feature_percentages.index:
                stats += f"{feature.capitalize()}: {feature_percentages[feature]:.2f}%\n"
        else:
            # Otherwise summarize each requested feature with basic statistics.
            features_to_analyze = all_features if 'all' in features else features
            for feature in features_to_analyze:
                feature_stats = calculate_transcript_stats(data_dict[candidate], feature)
                stats += f"{feature.capitalize()} - Mean: {feature_stats['mean']:.2f}, Median: {feature_stats['median']:.2f}, Standard Deviation: {feature_stats['std']:.2f}\n"
    features_display = "all features" if 'all' in features else ", ".join(features)
    return prompt.format(CANDIDATES=", ".join(selected_candidates), FEATURES=features_display, STATS=stats)
def generate_response(prompt, data_dict, features, selected_candidates, feature_percentages=None):
    """Send the analysis prompt plus the user query to Gemini and return the text reply."""
    model = genai.GenerativeModel('gemini-pro')
    analysis_prompt = format_transcript_prompt(features, data_dict, selected_candidates, feature_percentages)
    full_prompt = analysis_prompt + "\n\nUser query: " + prompt
    response = model.generate_content(full_prompt)
    # Walk the response structure defensively to pull out the first text part.
    if hasattr(response, 'candidates') and response.candidates:
        content = response.candidates[0].content
        if hasattr(content, 'parts'):
            for part in content.parts:
                if hasattr(part, 'text'):
                    return part.text
    return "Error: Unable to extract text from the response. Please check the API response structure."
def visualize_all_features(data_dict, selected_candidates):
    """Plot correlation, sentiment distributions, speech speed over time, and speed vs. confidence for the selected candidates."""
    feature_columns = ['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic',
                       'speech_speed']
    # Correlation Matrix
    fig, ax = plt.subplots(figsize=(12, 10))
    combined_data = pd.concat([data_dict[candidate][feature_columns] for candidate in selected_candidates],
                              keys=selected_candidates)
    correlation = combined_data.corr()
    sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Correlation Matrix of Transcript Score Data')
    st.pyplot(fig)
    # Sentiment Distributions
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    for i, sentiment in enumerate(['positive', 'negative', 'neutral']):
        for candidate in selected_candidates:
            sns.kdeplot(data_dict[candidate][sentiment], ax=axes[i], label=candidate, fill=True)
        axes[i].set_title(f'Distribution of {sentiment.capitalize()} Sentiment')
        axes[i].legend()
    st.pyplot(fig)
    # Speech Speed Over Time
    fig, ax = plt.subplots(figsize=(12, 6))
    for candidate in selected_candidates:
        ax.plot(data_dict[candidate]['start'], data_dict[candidate]['speech_speed'], label=candidate)
    ax.set_title('Speech Speed Over Time')
    ax.set_xlabel('Time')
    ax.set_ylabel('Speech Speed')
    ax.legend()
    st.pyplot(fig)
    # Speech Speed vs Confidence
    fig, ax = plt.subplots(figsize=(10, 6))
    for candidate in selected_candidates:
        sns.scatterplot(data=data_dict[candidate], x='speech_speed', y='confident', label=candidate, ax=ax)
    ax.set_title('Speech Speed vs Confidence')
    ax.legend()
    st.pyplot(fig)
    # Percentages are computed over the selected candidates only.
    return calculate_feature_percentages(pd.concat([data_dict[candidate] for candidate in selected_candidates]))
def visualize_single_candidate(data, features):
    """Plot either the full chart set for one candidate or just the requested feature distributions."""
    if 'all' in features:
        # Correlation Matrix
        fig, ax = plt.subplots(figsize=(12, 10))
        correlation = data[['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic', 'speech_speed']].corr()
        sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
        ax.set_title('Correlation Matrix of Transcript Score Data')
        st.pyplot(fig)
        # Sentiment Distributions
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))
        sns.histplot(data['positive'], kde=True, ax=axes[0])
        axes[0].set_title('Distribution of Positive Sentiment')
        sns.histplot(data['negative'], kde=True, ax=axes[1])
        axes[1].set_title('Distribution of Negative Sentiment')
        sns.histplot(data['neutral'], kde=True, ax=axes[2])
        axes[2].set_title('Distribution of Neutral Sentiment')
        st.pyplot(fig)
        # Speech Speed Over Time
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(data['start'], data['speech_speed'])
        ax.set_title('Speech Speed Over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel('Speech Speed')
        st.pyplot(fig)
        # Speech Speed vs Confidence
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(data=data, x='speech_speed', y='confident', ax=ax)
        ax.set_title('Speech Speed vs Confidence')
        st.pyplot(fig)
        return calculate_feature_percentages(data).to_dict()
    else:
        # Distribution plot for specific features
        fig, ax = plt.subplots(figsize=(10, 6))
        for feature in features:
            sns.kdeplot(data[feature], ax=ax, label=feature, fill=True)
        ax.set_title("Distribution of Selected Features")
        ax.set_xlabel("Value")
        ax.set_ylabel("Density")
        ax.legend()
        st.pyplot(fig)
        return None
def visualize_comparison(data_dict, features, selected_candidates):
    """Dispatch to the full dashboard or to per-feature distribution plots."""
    if 'all' in features or set(features) == {'positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic', 'speech_speed'}:
        return visualize_all_features(data_dict, selected_candidates)
    else:
        visualize_specific_features(data_dict, features, selected_candidates)
        return None
def visualize_specific_features(data_dict, features, selected_candidates):
    """Plot one KDE per requested feature, overlaying the selected candidates."""
    for feature in features:
        fig, ax = plt.subplots(figsize=(10, 6))
        for candidate in selected_candidates:
            sns.kdeplot(data_dict[candidate][feature], ax=ax, label=candidate, fill=True)
        ax.set_title(f"Distribution of '{feature}'")
        ax.set_xlabel("Value")
        ax.set_ylabel("Density")
        ax.legend()
        st.pyplot(fig)
def main():
    st.title("Multi-Candidate Transcript Analysis Chat Interface")
    num_candidates = st.number_input("How many candidates would you like to compare?", min_value=1, max_value=10,
                                     value=1)
    data_dict = {}
    for i in range(num_candidates):
        uploaded_file = st.file_uploader(f"Choose CSV file for Candidate {i + 1}", type="csv", key=f"candidate_{i + 1}")
        if uploaded_file is not None:
            data = load_and_preprocess_data(uploaded_file)
            data_dict[f"Candidate {i + 1}"] = data
    if len(data_dict) == num_candidates:
        st.success("All files uploaded successfully. You can now start chatting!")
        st.subheader("Data Information")
        for candidate, data in data_dict.items():
            st.write(f"{candidate} Columns:", data.columns.tolist())
        features = ['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic',
                    'speech_speed']
        st.write("Available Features:", features)
        if "messages" not in st.session_state:
            st.session_state.messages = []
        # Replay the chat history stored in session state.
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])
        # Candidate selection for each query; default to all candidates if none are picked.
        selected_candidates = st.multiselect("Select candidates to compare:", list(data_dict.keys()))
        if not selected_candidates:
            selected_candidates = list(data_dict.keys())
        if prompt := st.chat_input("What would you like to know about the candidates' transcripts?"):
            st.chat_message("user").markdown(prompt)
            st.session_state.messages.append({"role": "user", "content": prompt})
            # Infer which features the user is asking about from the query text.
            selected_features = []
            if any(keyword in prompt.lower() for keyword in ["all features", "all transcript", "compare all"]):
                selected_features = ['all']
            else:
                for feature in features:
                    if feature in prompt.lower():
                        selected_features.append(feature)
                if not selected_features:
                    selected_features = ['all']
            feature_percentages = None
            with st.chat_message("assistant"):
                feature_percentages = visualize_comparison(data_dict, selected_features, selected_candidates)
            response = generate_response(prompt, data_dict, selected_features, selected_candidates, feature_percentages)
            with st.chat_message("assistant"):
                st.markdown(response)
            st.session_state.messages.append({"role": "assistant", "content": response})
        if st.checkbox("Show raw data"):
            for candidate in selected_candidates:
                st.subheader(f"{candidate} Data")
                st.write(data_dict[candidate])


if __name__ == "__main__":
    main()
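
# Usage (assumed): install streamlit, pandas, seaborn, matplotlib, and
# google-generativeai, export GOOGLE_API_KEY with a valid Gemini key, then run:
#   streamlit run transcrip_score.py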