import gradio as gr
import pandas as pd
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
import re


def analyze_demographics(file):
    """Compute NPS and average scores overall and per demographic group."""
    df = pd.read_excel(file.name)

    results = {
        "Overall Metrics": {},
        "Underrepresented Group Metrics": {},
        "Tenure Metrics": {},
        "Team Metrics": {},
        "Nationality Metrics": {},
        "Legal Entity Metrics": {},
        "Work Location Metrics": {},
    }

    tenure_order = [
        "< 1 year", "1 year - 2 years", "2 years - 3 years",
        "3 years - 4 years", "> 4 years",
    ]

    # NPS = (% promoters [9-10] - % detractors [0-6]) * 100, on a -100..100 scale.
    recommend_col = "On a scale of 0 to 10, how likely are you to recommend working at Hugging Face to a friend or colleague?"
    if recommend_col in df.columns:
        promoters = df[recommend_col].apply(lambda x: x >= 9).sum()
        detractors = df[recommend_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[recommend_col].notna().sum()
        recommend_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        recommend_avg = df[recommend_col].mean()
        results["Overall Metrics"]["HF NPS"] = round(recommend_nps, 2) if recommend_nps is not None else None
        results["Overall Metrics"]["HF NPS (Average)"] = round(recommend_avg, 2)

    support_col = "On a scale of 0 to 10, how likely are you to recommend the support functions at HF (diversity, finance, hr, legal, security, talent) to a friend or colleague?"
    if support_col in df.columns:
        promoters = df[support_col].apply(lambda x: x >= 9).sum()
        detractors = df[support_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[support_col].notna().sum()
        support_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        support_avg = df[support_col].mean()
        results["Overall Metrics"]["Support NPS"] = round(support_nps, 2) if support_nps is not None else None
        results["Overall Metrics"]["Support NPS (Average)"] = round(support_avg, 2)

    demographic_columns = [
        ("I identify as a member of an underrepresented group in tech. (e.g. including but not limited to gender, age, disability, sexuality, etc.)", "Underrepresented Group Metrics"),
        ("How long have you been at Hugging Face? (optional)", "Tenure Metrics"),
        ("Which team are you on here at Hugging Face? (optional)", "Team Metrics"),
        ("What is your primary nationality? (optional -- we only listed the largest groups to ensure anonymity.)", "Nationality Metrics"),
        ("Which legal entity are you employed by at HF? (optional)", "Legal Entity Metrics"),
        ("Are you fully remote or work mostly from a Hugging Face office? (optional)", "Work Location Metrics"),
    ]

    tenure_col = "How long have you been at Hugging Face? (optional)"

    for demo_col, demo_category in demographic_columns:
        if demo_col in df.columns:
            for col, prefix in [(recommend_col, "HF NPS"), (support_col, "Support NPS")]:
                if col in df.columns:
                    grouped_demo = df.groupby(demo_col)[col]

                    # NPS per demographic group
                    nps_by_demo = {}
                    for group, scores in grouped_demo:
                        promoters = scores.apply(lambda x: x >= 9).sum()
                        detractors = scores.apply(lambda x: x <= 6).sum()
                        total = scores.notna().sum()
                        nps_by_demo[group] = ((promoters - detractors) / total) * 100 if total > 0 else None

                    if demo_category == "Tenure Metrics" and demo_col == tenure_col:
                        # Keep tenure buckets in chronological order
                        sorted_nps_by_demo = {k: nps_by_demo.get(k) for k in tenure_order if k in nps_by_demo}
                        results[demo_category][prefix] = {k: round(v, 2) if v is not None else None for k, v in sorted_nps_by_demo.items()}
                    else:
                        results[demo_category][prefix] = {k: round(v, 2) if v is not None else None for k, v in nps_by_demo.items()}

                    # Average score per demographic group
                    averages_demo = grouped_demo.mean()
                    if demo_category == "Tenure Metrics" and demo_col == tenure_col:
                        sorted_averages_demo = {k: averages_demo.get(k) for k in tenure_order if k in averages_demo}
                        results[demo_category][f"{prefix} (Average)"] = {k: round(v, 2) if v is not None else None for k, v in sorted_averages_demo.items()}
                    else:
                        results[demo_category][f"{prefix} (Average)"] = averages_demo.round(2).to_dict()

    return results


def analyze_why_columns(file):
    """Run sentiment analysis, topic modeling, keyword extraction, and summarization on free-text answers."""
    df = pd.read_excel(file.name)

    # Map the generic "Why?" column names to descriptive labels
    column_label_map = {
        "Why? (optional)": "HF NPS Why?",
        "Why? (optional.1)": "Support Team NPS Why?",
        "Why? (optional.2)": "Productivity Why?",
    }

    # Rename columns in the DataFrame
    df = df.rename(columns=column_label_map)

    # Keep only the renamed "Why" columns that are actually present
    why_columns = [col for col in df.columns if col in column_label_map.values()]

    results = {}

    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    for col in why_columns:
        column_data = df[col].dropna().astype(str).tolist()

        # Sentiment analysis with confidence scores (truncate long responses to the model's max length)
        sentiments = sentiment_analyzer(column_data, truncation=True)
        sentiment_summary = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        detailed_sentiments = {"POSITIVE": [], "NEGATIVE": [], "NEUTRAL": []}
        for response, sentiment in zip(column_data, sentiments):
            label = sentiment["label"]
            score = sentiment["score"]
            sentiment_summary[label] += 1
            detailed_sentiments[label].append({"response": response, "score": round(score, 2)})

        # Topic modeling: LDA over a bag-of-words representation of the responses
        vectorizer = CountVectorizer(stop_words="english")
        X = vectorizer.fit_transform(column_data)
        lda = LatentDirichletAllocation(n_components=3, random_state=0)
        lda.fit(X)
        topics = []
        for idx, topic in enumerate(lda.components_):
            top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:]]
            topics.append(f"Topic {idx + 1}: " + ", ".join(top_words))

        # Keyword extraction: most frequent bigrams and trigrams across all responses
        combined_text = " ".join(column_data)
        word_list = re.findall(r"\b\w+\b", combined_text.lower())
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words="english")
        bigram_counts = bigram_vectorizer.fit_transform([combined_text])
        bigram_features = bigram_vectorizer.get_feature_names_out()
        bigram_counts_sum = bigram_counts.toarray().sum(axis=0)
        bigram_frequency = Counter(dict(zip(bigram_features, bigram_counts_sum))).most_common(10)
        keywords = [f"{phrase} ({count} mentions)" for phrase, count in bigram_frequency]

        # Summarization: split the combined text into chunks the model can handle
        def split_text(text, max_length=1000):
            words = text.split()
            for i in range(0, len(words), max_length):
                yield " ".join(words[i:i + max_length])

        summaries = []
        for chunk in split_text(combined_text, max_length=500):
            summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
            summaries.append(summary)
        final_summary = " ".join(summaries)

        # Store results
        results[col] = {
            "Sentiment Analysis Summary": sentiment_summary,
            "Detailed Sentiments": detailed_sentiments,
            "Topics": topics,
            "Keywords": keywords,
            "Summary": final_summary,
        }

    return results


def process_file(file):
    quantitative_results = analyze_demographics(file)
    qualitative_results = analyze_why_columns(file)
    return quantitative_results, qualitative_results


def app():
    file_input = gr.File(label="Upload Survey Data (Excel format)")
    text_output = gr.JSON(label="Quantitative Analysis Results")
    qualitative_output = gr.JSON(label="Qualitative Analysis Results")

    iface = gr.Interface(
        fn=process_file,
        inputs=file_input,
        outputs=[text_output, qualitative_output],
        title="Survey Data Analyzer",
        description="Analyze both quantitative and qualitative survey data. Upload an Excel file to generate insights.",
    )
    return iface


if __name__ == "__main__":
    app().launch(share=True)