# fcas/visualisations.py
# Provenance: commit 9c062cd ("Clean repo, remove binary history") by lsempe
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
import numpy as np
def create_world_map(docs_df):
    """Create an interactive world map showing study distribution for
    conflict-affected countries only.

    Expects a ``study_countries`` column containing comma- or semicolon-
    separated country names. Returns the plotly Figure (after showing it),
    or None when the DataFrame is empty or the column is missing.
    """
    if docs_df.empty or 'study_countries' not in docs_df.columns:
        print("No data or missing 'study_countries' column")
        return None
    # Target study counts, kept in two explicit groups so each country can
    # be classified by membership. (The previous `target_count > 400`
    # threshold mislabelled nationwide-conflict countries with fewer than
    # 400 studies — e.g. Haiti, Somalia, Libya — as "Partial".)
    nationwide_conflict = {
        'Burkina Faso': 1098,
        'Afghanistan': 697,
        'Mali': 496,
        'Sudan': 470,
        'Haiti': 394,
        'Somalia': 373,
        'Niger': 352,
        'Syria': 323,
        'South Sudan': 294,
        'Libya': 119,
        'Palestinian Territories': 81,
        'Central African Republic': 72,
    }
    partial_conflict = {
        'Iraq': 128,
        'Nigeria': 121,
        'Lebanon': 102,
        'Ethiopia': 81,
        'Democratic Republic of the Congo': 71,
        'Cameroon': 54,
        'Chad': 36,
        'Mozambique': 30,
        'Myanmar': 11,
    }
    target_countries = {**nationwide_conflict, **partial_conflict}
    # Count actual studies in our dataset for these countries.
    country_counts = Counter()
    for countries_str in docs_df['study_countries'].dropna():
        text = str(countries_str)
        # Skip string-typed missing markers that survive dropna().
        if text.lower() in ['nan', 'none', '']:
            continue
        # Normalise ';' separators to ',' then split into individual names.
        for country in (c.strip() for c in text.replace(';', ',').split(',')):
            if country in target_countries:
                country_counts[country] += 1
    # One row per target country, with actual counts where available.
    map_data = []
    for country, target_count in target_countries.items():
        map_data.append({
            'country': country,
            'actual_studies': country_counts.get(country, 0),
            'target_studies': target_count,
            # Classify by group membership, not by study count.
            'conflict_type': "Nationwide" if country in nationwide_conflict else "Partial",
        })
    map_df = pd.DataFrame(map_data)
    print(f"Mapping {len(map_df)} conflict-affected countries")
    print(f"Countries with data: {map_df[map_df['actual_studies'] > 0]['country'].tolist()}")
    # Choropleth coloured by target study counts; dataset counts in hover.
    fig = go.Figure(data=go.Choropleth(
        locations=map_df['country'],
        z=map_df['target_studies'],
        locationmode='country names',
        colorscale='Reds',
        hovertemplate='<b>%{location}</b><br>' +
                      'Studies (Target): %{z}<br>' +
                      'Studies (In Dataset): %{customdata}<br>' +
                      '<extra></extra>',
        customdata=map_df['actual_studies'],
        colorbar_title="Number of Studies"
    ))
    fig.update_layout(
        title={
            'text': 'Research Coverage: Conflict-Affected Countries',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='natural earth'
        ),
        height=600,
        width=1000
    )
    fig.show()
    return fig
def create_interactive_data_explorer(docs_df):
    """Create an interactive data explorer for methodology analysis.

    Prints a dataset overview, builds a per-variable completeness bar
    chart, and returns ``(fig, summary_df)``. Returns None when the
    DataFrame is empty or none of the expected columns contain data.
    """
    if docs_df.empty:
        print("No data available")
        return None
    print("=== DATASET OVERVIEW ===")
    print(f"Total studies: {len(docs_df)}")
    print(f"Columns available: {len(docs_df.columns)}")
    # Key numeric columns for analysis
    numeric_cols = ['publication_year', 'sample_numeric', 'rigor_score', 'sdg_number']
    categorical_cols = [
        'world_bank_sector', 'research_design', 'data_collection_method',
        'analysis_type', 'study_countries', 'population', 'author_income_group',
        'has_validation', 'has_randomization', 'has_mixed_methods', 'has_advanced_analysis'
    ]
    # Only keep the columns that actually exist in this dataset.
    available_numeric = [col for col in numeric_cols if col in docs_df.columns]
    available_categorical = [col for col in categorical_cols if col in docs_df.columns]
    print(f"Numeric variables: {available_numeric}")
    print(f"Categorical variables: {available_categorical}")
    # Build one summary row per variable.
    summary_data = []
    # Numeric summaries: coerce to numbers so stray strings count as missing.
    for col in available_numeric:
        values = pd.to_numeric(docs_df[col], errors='coerce').dropna()
        if len(values) > 0:
            summary_data.append({
                'Variable': col,
                'Type': 'Numeric',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"Mean: {values.mean():.1f}, Range: {values.min()}-{values.max()}"
            })
    # Categorical summaries: category count plus the most frequent value.
    for col in available_categorical:
        values = docs_df[col].dropna()
        if len(values) > 0:
            unique_count = values.nunique()
            top_category = values.value_counts().index[0]
            summary_data.append({
                'Variable': col,
                'Type': 'Categorical',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"{unique_count} categories, Top: {top_category}"
            })
    summary_df = pd.DataFrame(summary_data)
    # BUG FIX: with no usable columns, summary_df has no 'Variable' column
    # and the chart-building below raised KeyError. Bail out gracefully.
    if summary_df.empty:
        print("No analyzable variables found")
        return None
    # Stacked bars: valid vs missing record counts per variable.
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Valid_Values'],
        name='Valid Values',
        marker_color='steelblue',
        hovertemplate='<b>%{x}</b><br>Valid: %{y}<br>%{customdata}<extra></extra>',
        customdata=summary_df['Summary']
    ))
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Missing'],
        name='Missing Values',
        marker_color='lightcoral'
    ))
    fig.update_layout(
        title='Data Completeness by Variable',
        xaxis_title='Variables',
        yaxis_title='Number of Records',
        barmode='stack',
        height=500,
        xaxis={'tickangle': 45}
    )
    fig.show()
    # Plain-text version of the same summary.
    print("\n=== VARIABLE SUMMARY ===")
    for _, row in summary_df.iterrows():
        print(f"{row['Variable']} ({row['Type']}): {row['Valid_Values']}/{row['Valid_Values'] + row['Missing']} values - {row['Summary']}")
    return fig, summary_df
def create_pivot_analysis(docs_df, row_var, col_var, value_var=None, agg_func='count'):
    """Create a pivot table analysis with visualization.

    Aggregates ``value_var`` (when given and present) with ``agg_func``,
    otherwise cross-tabulates row/column counts. Shows a heatmap and
    returns ``(fig, pivot_df)``, or None on empty/invalid input.
    """
    if docs_df.empty:
        return None
    if row_var not in docs_df.columns or col_var not in docs_df.columns:
        print(f"Variables not found. Available: {list(docs_df.columns)}")
        return None
    try:
        use_values = bool(value_var) and value_var in docs_df.columns
        if use_values:
            # Numeric aggregation over the requested value column.
            table = docs_df.pivot_table(
                index=row_var,
                columns=col_var,
                values=value_var,
                aggfunc=agg_func,
                fill_value=0,
            )
            title = f"{agg_func.title()} of {value_var} by {row_var} and {col_var}"
        else:
            # Fall back to simple record counts.
            table = pd.crosstab(docs_df[row_var], docs_df[col_var])
            title = f"Study Count by {row_var} and {col_var}"
        # Render the pivot as a heatmap, sized to the table's dimensions.
        heatmap = px.imshow(
            table.values,
            x=table.columns,
            y=table.index,
            color_continuous_scale='Viridis',
            title=title,
        )
        heatmap.update_layout(
            height=max(400, 30 * len(table.index)),
            width=max(600, 50 * len(table.columns)),
        )
        heatmap.show()
        print(f"\nPivot Table: {row_var} × {col_var}")
        print(table.head(10))
        return heatmap, table
    except Exception as e:
        print(f"Error creating pivot: {e}")
        return None
# Example usage functions
def explore_methodology_patterns(docs_df):
    """Explore common methodology patterns (research design by sector)."""
    if docs_df.empty:
        return None
    required = ('research_design', 'world_bank_sector')
    # Only run the pivot when both source columns are present.
    if all(column in docs_df.columns for column in required):
        print("=== RESEARCH DESIGN BY SECTOR ===")
        return create_pivot_analysis(docs_df, 'world_bank_sector', 'research_design')
def explore_data_collection(docs_df):
    """Explore data collection patterns by author income group."""
    if docs_df.empty:
        return None
    required = ('data_collection_method', 'author_income_group')
    # Only run the pivot when both source columns are present.
    if all(column in docs_df.columns for column in required):
        print("=== DATA COLLECTION BY AUTHOR INCOME GROUP ===")
        return create_pivot_analysis(docs_df, 'author_income_group', 'data_collection_method')
def filter_and_analyze(docs_df, **filters):
    """Filter data and run analysis on the subset.

    Supported keyword filters: ``countries`` (str or list, substring match),
    ``sectors`` (str or list, exact match), ``min_year``/``max_year``,
    ``has_rct`` (truthy), ``min_sample_size``. Filters whose column is
    absent from the DataFrame are skipped rather than raising KeyError.
    Returns the filtered DataFrame, or None when no data / no rows match.
    """
    import re  # local: only needed to escape user-supplied country names

    if docs_df.empty:
        print("No data available")
        return None
    filtered = docs_df.copy()
    filter_summary = []
    # Apply filters; each is guarded on its column so partial datasets work.
    if filters.get('countries') and 'study_countries' in filtered.columns:
        countries = filters['countries'] if isinstance(filters['countries'], list) else [filters['countries']]
        # BUG FIX: escape names before joining into a regex so countries
        # containing metacharacters (e.g. parentheses) match literally.
        pattern = '|'.join(re.escape(c) for c in countries)
        country_mask = filtered['study_countries'].str.contains(pattern, case=False, na=False)
        filtered = filtered[country_mask]
        filter_summary.append(f"Countries: {', '.join(countries)}")
    if filters.get('sectors') and 'world_bank_sector' in filtered.columns:
        sectors = filters['sectors'] if isinstance(filters['sectors'], list) else [filters['sectors']]
        filtered = filtered[filtered['world_bank_sector'].isin(sectors)]
        filter_summary.append(f"Sectors: {', '.join(sectors)}")
    if filters.get('min_year') and 'publication_year' in filtered.columns:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col >= filters['min_year']]
        filter_summary.append(f"Year >= {filters['min_year']}")
    if filters.get('max_year') and 'publication_year' in filtered.columns:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col <= filters['max_year']]
        filter_summary.append(f"Year <= {filters['max_year']}")
    if filters.get('has_rct') and 'has_randomization' in filtered.columns:
        filtered = filtered[filtered['has_randomization'].str.lower().isin(['true', 'yes', '1'])]
        filter_summary.append("RCT studies only")
    if filters.get('min_sample_size') and 'sample_numeric' in filtered.columns:
        sample_col = pd.to_numeric(filtered['sample_numeric'], errors='coerce')
        filtered = filtered[sample_col >= filters['min_sample_size']]
        filter_summary.append(f"Sample size >= {filters['min_sample_size']}")
    # Show results
    print(f"=== FILTERED ANALYSIS ===")
    print(f"Filters applied: {'; '.join(filter_summary) if filter_summary else 'None'}")
    print(f"Studies found: {len(filtered)}/{len(docs_df)}")
    if filtered.empty:
        print("No studies match the criteria.")
        return None
    # Quick distributional summary, only worthwhile beyond a handful of rows.
    if len(filtered) > 5:
        if 'world_bank_sector' in filtered.columns:
            print(f"\nTop sectors: {dict(filtered['world_bank_sector'].value_counts().head(3))}")
        if 'research_design' in filtered.columns:
            print(f"Research designs: {dict(filtered['research_design'].value_counts().head(3))}")
        if 'rigor_score' in filtered.columns:
            rigor_scores = pd.to_numeric(filtered['rigor_score'], errors='coerce').dropna()
            if len(rigor_scores) > 0:
                print(f"Rigor score: mean={rigor_scores.mean():.1f}, range={rigor_scores.min()}-{rigor_scores.max()}")
    return filtered
# Quick start function
def quick_analysis(docs_df):
    """Run a quick analysis of the dataset.

    Runs the data explorer, the world map, and sample pivot analyses.
    Returns ``(explorer_fig, map_fig, summary_df)``; figure/summary slots
    are None when the corresponding step had no data to work with.
    """
    print("Starting comprehensive data analysis...")
    # 1. Data overview.
    # BUG FIX: create_interactive_data_explorer returns None for empty
    # input; the original unpacked unconditionally and raised TypeError.
    explorer_result = create_interactive_data_explorer(docs_df)
    if explorer_result is None:
        explorer_fig, summary_df = None, None
    else:
        explorer_fig, summary_df = explorer_result
    # 2. Map
    map_fig = create_world_map(docs_df)
    # 3. Sample pivot analyses (only when there is any data)
    if len(docs_df) > 0:
        explore_methodology_patterns(docs_df)
        explore_data_collection(docs_df)
    return explorer_fig, map_fig, summary_df