dsfsi-lid-space

Sleeping

App Files Files Community

dsfsi-lid-space / app.py

vukosi

Update app.py

590ee2f verified 6 months ago

raw

history blame contribute delete

16.5 kB

	# coding=utf-8
	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt
	import altair as alt
	from transformers import pipeline
	import fasttext
	from huggingface_hub import hf_hub_download
	import json
	import os
	import re
	import string
	import base64
	from typing import List, Tuple, Dict, Optional
	import logging

	# Configure page
	# Sets the browser-tab title and icon, selects the wide layout and opens the
	# sidebar by default.
	# NOTE(review): Streamlit's documentation asks for set_page_config to run
	# before other st.* commands — keep this call ahead of the rest of the
	# script; confirm against the Streamlit version used by this Space.
	st.set_page_config(
	page_title="South African Language Identification",
	page_icon="🇿🇦",
	layout="wide",
	initial_sidebar_state="expanded"
	)

	# Custom CSS for better styling
	# Injects one <style> element through st.markdown; unsafe_allow_html=True is
	# passed so the markup is emitted as HTML rather than shown as text.
	# Classes defined here:
	#   .main-header      - gradient banner, used by the header markup in main()
	#   .model-card       - model description box, used in the sidebar
	#   .result-container - white card with a drop shadow (no use visible in
	#                       this file)
	#   .metric-card      - gradient tile, used for the three result metrics
	st.markdown("""
	<style>
	.main-header {
	text-align: center;
	padding: 1rem 0;
	background: linear-gradient(90deg, #ff6b35, #f7931e);
	color: white;
	border-radius: 10px;
	margin-bottom: 2rem;
	}
	.model-card {
	background: #f8f9fa;
	padding: 1rem;
	border-radius: 8px;
	border-left: 4px solid #ff6b35;
	margin: 1rem 0;
	}
	.result-container {
	background: white;
	padding: 1.5rem;
	border-radius: 10px;
	box-shadow: 0 2px 10px rgba(0,0,0,0.1);
	margin: 1rem 0;
	}
	.metric-card {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	color: white;
	padding: 1rem;
	border-radius: 8px;
	text-align: center;
	}
	</style>
	""", unsafe_allow_html=True)

	# Constants and Configuration
	# Registry of the selectable classifiers, keyed by the identifier used in the
	# model picker. Every entry has a display name, the model id handed to
	# `pipeline`, and a short description; "recommended" is present only on the
	# entry that gets a star in the picker.
	MODEL_CONFIGS = {
	    "za-bert": dict(
	        name="ZA-BERT",
	        model_id="dsfsi/za-lid-bert",
	        description="Lightweight BERT-based model trained on South African languages",
	        recommended=True,
	    ),
	    "xlmr-large": dict(
	        name="XLM-R Large",
	        model_id="dsfsi/za-xlmrlarge-lid",
	        description="XLM-RoBERTa Large model fine-tuned for SA languages",
	    ),
	    "serengeti": dict(
	        name="Serengeti",
	        model_id="dsfsi/za-serengeti-lid",
	        description="Afri-centric model with superior performance",
	    ),
	    "afriberta": dict(
	        name="AfriBERTa",
	        model_id="dsfsi/za-afriberta-lid",
	        description="African-focused BERT model",
	    ),
	    "afro-xlmr": dict(
	        name="Afro-XLM-R",
	        model_id="dsfsi/za-afro-xlmr-base-lid",
	        description="African-centric XLM-RoBERTa model",
	    ),
	    "afrolm": dict(
	        name="AfroLM",
	        model_id="dsfsi/za-afrolm-lid",
	        description="African language model",
	    ),
	}

	# Utility Functions
	@st.cache_data
	def load_language_names() -> Dict[str, str]:
	    """Return the mapping from language code to display name.

	    The mapping is read from ``assets/language_names.json``. When that file
	    does not exist, a built-in table covering the South African languages
	    (plus ``"und"`` for "Undetermined") is returned instead.
	    """
	    try:
	        # JSON is UTF-8 by specification; do not depend on the platform's
	        # default encoding, which would garble non-ASCII language names.
	        with open("assets/language_names.json", 'r', encoding="utf-8") as f:
	            return json.load(f)
	    except FileNotFoundError:
	        # Fallback mapping for common South African languages
	        return {
	            "afr": "Afrikaans",
	            "eng": "English",
	            "nso": "Northern Sotho",
	            "sot": "Sesotho",
	            "ssw": "Siswati",
	            "tsn": "Setswana",
	            "tso": "Xitsonga",
	            "ven": "Tshivenda",
	            "xho": "isiXhosa",
	            "zul": "isiZulu",
	            "nbl": "isiNdebele",
	            "und": "Undetermined",
	        }

	@st.cache_resource
	def _load_pipeline(model_id: str):
	    """Build the text-classification pipeline for *model_id*.

	    Failures are allowed to propagate so that only successfully built
	    pipelines end up in the resource cache.
	    """
	    return pipeline("text-classification", model=model_id)


	def load_model(model_key: str):
	    """Return the classifier registered under *model_key*, or ``None``.

	    The pipeline itself is cached by ``_load_pipeline``. Error handling lives
	    in this uncached wrapper on purpose: returning ``None`` from inside the
	    cached function would store that ``None``, and every later attempt for
	    the same model would fail without retrying the load.

	    Any error (unknown key, download or initialisation failure) is reported
	    with ``st.error`` and turned into a ``None`` return value.
	    """
	    try:
	        config = MODEL_CONFIGS[model_key]
	        return _load_pipeline(config["model_id"])
	    except Exception as e:
	        st.error(f"Error loading model {model_key}: {str(e)}")
	        return None

	# Characters replaced by a space before classification: a handful of
	# punctuation/markup symbols (including the backslash) and the ASCII digits.
	# Built once instead of on every call; the backslash is escaped explicitly
	# because '\|' is an invalid escape sequence.
	_PREPROCESS_TABLE = {ord(c): ' ' for c in ':•#{\\|}' + string.digits}
	_WHITESPACE_RUN = re.compile(r'\s+')


	def preprocess_text(text: str) -> str:
	    """Clean *text* for the language-identification models.

	    Colons, bullets, ``#``, braces, backslashes, pipes and ASCII digits are
	    replaced by spaces, every run of whitespace (newlines included) is
	    collapsed to a single space, and the result is stripped.

	    Returns an empty string for empty, ``None`` or whitespace-only input.
	    """
	    if not text or not text.strip():
	        return ""

	    cleaned = text.translate(_PREPROCESS_TABLE)

	    # The whitespace collapse also covers newlines, so no separate
	    # newline replacement is needed.
	    return _WHITESPACE_RUN.sub(' ', cleaned).strip()

	def get_language_name(label: str, lang_names: Dict[str, str]) -> str:
	    """Translate a model label into a display name.

	    Only the part of *label* before the first underscore is looked up in
	    *lang_names*; when that code is unknown, the full label is returned
	    unchanged.
	    """
	    # partition() hands back the whole label when there is no underscore
	    code, _sep, _rest = label.partition('_')
	    return lang_names.get(code, label)

	def predict_language(text: str, model, lang_names: Dict[str, str]) -> Tuple[str, float, str]:
	    """Classify *text* and return ``(label, confidence, language_name)``.

	    Args:
	        text: Raw input; it is cleaned with ``preprocess_text`` first.
	        model: Callable classifier as returned by ``load_model``; it is
	            expected to return a list of ``{"label", "score"}`` dicts.
	        lang_names: Mapping used to turn the label into a display name.

	    Returns:
	        The top prediction's label, score and display name.
	        ``("und", 0.0, "Undetermined")`` is returned when there is no model,
	        no usable text, or no prediction; ``("und", 0.0, "Error")`` when the
	        model call raises (the error is also shown with ``st.error``).
	    """
	    if not model or not text.strip():
	        return "und", 0.0, "Undetermined"

	    try:
	        processed_text = preprocess_text(text)
	        if not processed_text:
	            return "und", 0.0, "Undetermined"

	        # Ask the tokenizer to truncate: without it, inputs longer than the
	        # model's maximum sequence length make the forward pass fail and the
	        # user only sees "Error" for long documents.
	        result = model(processed_text, truncation=True)
	        if isinstance(result, list) and len(result) > 0:
	            prediction = result[0]
	            label = prediction['label']
	            confidence = prediction['score']
	            language_name = get_language_name(label, lang_names)
	            return label, confidence, language_name

	        return "und", 0.0, "Undetermined"

	    except Exception as e:
	        # UI boundary: report the failure instead of crashing the page.
	        st.error(f"Prediction error: {str(e)}")
	        return "und", 0.0, "Error"

	def create_confidence_plot(language: str, confidence: float) -> plt.Figure:
	    """Build a one-bar chart showing *confidence* for *language*.

	    Args:
	        language: Display name shown in the title.
	        confidence: Score to plot; the x-axis is fixed to the range 0..1.

	    Returns:
	        The figure, ready to be handed to ``st.pyplot``.
	    """
	    # Build the figure directly instead of through pyplot. Figures created
	    # with plt.subplots stay registered in pyplot's global state until they
	    # are closed, so a long-running app would accumulate one per analysis.
	    from matplotlib.figure import Figure

	    fig = Figure(figsize=(10, 2))
	    ax = fig.subplots()

	    # Colors
	    primary_color = "#ff6b35"
	    bg_color = "#f8f9fa"
	    text_color = "#2c3e50"

	    # Filled part of the bar, then the remainder up to 1.0 in a faint tone
	    ax.barh([0], [confidence], color=primary_color, height=0.6, alpha=0.8)
	    ax.barh([0], [1 - confidence], left=[confidence], color=bg_color, height=0.6, alpha=0.3)

	    # Styling
	    ax.set_xlim(0, 1)
	    ax.set_ylim(-0.5, 0.5)
	    ax.set_xlabel("Confidence Score", fontsize=12, color=text_color)
	    ax.set_title(f"Language: {language} (Confidence: {confidence:.3f})",
	                 fontsize=14, fontweight='bold', color=text_color, pad=20)

	    # Remove y-axis and spines
	    ax.set_yticks([])
	    ax.spines['top'].set_visible(False)
	    ax.spines['right'].set_visible(False)
	    ax.spines['left'].set_visible(False)

	    # Percentage label centred inside the filled part of the bar
	    ax.text(confidence / 2, 0, f"{confidence:.1%}",
	            ha='center', va='center', fontweight='bold', color='white')

	    fig.tight_layout()
	    return fig

	def render_paper_info():
	    """Show the paper summary next to a column of related links."""
	    paper_summary = """
	"From N-grams to Pre-trained Multilingual Models For Language Identification"

	Authors: Thapelo Andrew Sindane, Vukosi Marivate

	Published in: Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities (2024)

	This research investigates N-gram models and large pre-trained multilingual models for Language Identification
	across 11 South African languages, showing that Serengeti performs best across all model types.
	"""
	    related_links = """
	Links:
	- [📖 Paper](https://aclanthology.org/2024.nlp4dh-1.22/)
	- [🤗 HuggingFace](https://huggingface.co/dsfsi)
	- [💻 GitHub](https://github.com/dsfsi/za-lid)
	"""

	    st.markdown("### 📄 Research Paper")

	    # The summary column is twice as wide as the links column
	    summary_col, links_col = st.columns([2, 1])

	    with summary_col:
	        st.markdown(paper_summary)

	    with links_col:
	        st.markdown(related_links)

	def render_citation():
	    """Show the paper's BibTeX entry in a code block."""
	    # One list item per line of the entry; the leading tab of the field
	    # lines is part of the displayed text.
	    bibtex_lines = [
	        '@inproceedings{sindane-marivate-2024-n,',
	        '\ttitle = "From N-grams to Pre-trained Multilingual Models For Language Identification",',
	        '\tauthor = "Sindane, Thapelo Andrew and Marivate, Vukosi",',
	        '\teditor = "Hämäläinen, Mika and Öhman, Emily and Miyagawa, So and Alnajjar, Khalid and Bizzoni, Yuri",',
	        '\tbooktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",',
	        '\tmonth = nov,',
	        '\tyear = "2024",',
	        '\taddress = "Miami, USA",',
	        '\tpublisher = "Association for Computational Linguistics",',
	        '\turl = "https://aclanthology.org/2024.nlp4dh-1.22/",',
	        '\tdoi = "10.18653/v1/2024.nlp4dh-1.22",',
	        '\tpages = "229--239"',
	        '\t}',
	    ]

	    st.code("\n".join(bibtex_lines), language='bibtex')

	# Session-state key of the single-text input; shared by the text area and
	# the Clear button callback.
	_SINGLE_TEXT_KEY = "single_text_input"


	def _reset_single_text() -> None:
	    """Empty the single-text input (runs as the Clear button callback)."""
	    st.session_state[_SINGLE_TEXT_KEY] = ""


	def _metric_card(value: str, caption: str) -> None:
	    """Render one gradient metric tile with *value* above *caption*."""
	    st.markdown(f"""
	<div class="metric-card">
	<h3>{value}</h3>
	<p>{caption}</p>
	</div>
	""", unsafe_allow_html=True)


	def _render_sidebar() -> str:
	    """Draw the sidebar and return the key of the selected model."""
	    with st.sidebar:
	        st.header("⚙️ Model Configuration")

	        # Model selection; the recommended entry is prefixed with a star
	        selected_model = st.selectbox(
	            "Choose Model:",
	            options=list(MODEL_CONFIGS.keys()),
	            format_func=lambda x: f"{'⭐ ' if MODEL_CONFIGS[x].get('recommended') else ''}{MODEL_CONFIGS[x]['name']}",
	            index=0,
	            help="Select the language identification model"
	        )

	        # Model info
	        model_config = MODEL_CONFIGS[selected_model]
	        st.markdown(f"""
	<div class="model-card">
	<h4>{model_config['name']}</h4>
	<p>{model_config['description']}</p>
	</div>
	""", unsafe_allow_html=True)

	        # Supported languages
	        st.subheader("📋 Supported Languages")
	        supported_langs = [
	            "🏴󠁺󠁡󠁺󠁡󠁿 Afrikaans", "🇬🇧 English", "🌍 Northern Sotho",
	            "🌍 Sesotho", "🌍 Siswati", "🌍 Setswana",
	            "🌍 Xitsonga", "🌍 Tshivenda", "🌍 isiXhosa",
	            "🌍 isiZulu", "🌍 isiNdebele"
	        ]
	        for lang in supported_langs:
	            st.write(f"• {lang}")

	    return selected_model


	def _render_single_text_tab(selected_model: str, lang_names: Dict[str, str]) -> None:
	    """Render the tab that classifies one piece of typed or pasted text."""
	    st.header("Single Text Analysis")

	    # The key ties the widget value to session state so that the Clear
	    # callback can reset it.
	    user_text = st.text_area(
	        "Enter text to identify language:",
	        placeholder="Type or paste your text here...",
	        height=100,
	        help="Enter text in any South African language",
	        key=_SINGLE_TEXT_KEY
	    )

	    col1, col2, _spacer = st.columns([1, 1, 2])

	    with col1:
	        analyze_button = st.button("🔍 Analyze", type="primary", use_container_width=True)

	    with col2:
	        # A plain rerun keeps the widget value, so clearing is done in a
	        # callback that resets the stored text; the click itself already
	        # triggers the rerun.
	        st.button("🗑️ Clear", use_container_width=True, on_click=_reset_single_text)

	    if not analyze_button:
	        return

	    if not user_text.strip():
	        st.warning("Please enter some text to analyze.")
	        return

	    with st.spinner("Analyzing language..."):
	        # Load model
	        model = load_model(selected_model)
	        if not model:
	            st.error("Failed to load the model. Please try again.")
	            return

	        # Predict
	        label, confidence, language_name = predict_language(user_text, model, lang_names)

	        # Results
	        st.markdown("### 📊 Results")

	        # Metrics
	        metrics = [
	            (language_name, "Detected Language"),
	            (f"{confidence:.1%}", "Confidence"),
	            (label, "Language Code"),
	        ]
	        for column, (value, caption) in zip(st.columns(3), metrics):
	            with column:
	                _metric_card(value, caption)

	        # Confidence visualization
	        st.markdown("### 📈 Confidence Visualization")
	        fig = create_confidence_plot(language_name, confidence)
	        st.pyplot(fig)


	def _read_uploaded_texts(uploaded_file) -> Optional[List[str]]:
	    """Extract the non-blank texts from an uploaded .csv or .txt file.

	    Returns ``None`` (after showing an error) when a CSV has no 'text'
	    column; otherwise a possibly empty list of texts.
	    """
	    if uploaded_file.name.endswith('.csv'):
	        df = pd.read_csv(uploaded_file)
	        if 'text' not in df.columns:
	            st.error("CSV file must contain a 'text' column")
	            return None
	        # Drop missing cells first; astype(str) would otherwise turn them
	        # into the literal text "nan" and classify that.
	        return [t for t in df['text'].dropna().astype(str) if t.strip()]

	    # utf-8-sig removes a leading byte-order mark when present and is
	    # otherwise identical to utf-8.
	    content = uploaded_file.read().decode('utf-8-sig')
	    return [line.strip() for line in content.split('\n') if line.strip()]


	def _render_bulk_tab(selected_model: str, lang_names: Dict[str, str]) -> None:
	    """Render the tab that classifies every text of an uploaded file."""
	    st.header("Bulk Text Analysis")

	    uploaded_file = st.file_uploader(
	        "Upload a text file",
	        type=['txt', 'csv'],
	        help="Upload a .txt file with one sentence per line, or a CSV file with a 'text' column"
	    )

	    if not uploaded_file:
	        return

	    try:
	        # Read file
	        texts = _read_uploaded_texts(uploaded_file)
	        if texts is None:
	            # Return instead of st.stop(): stopping the script here would
	            # also prevent the About tab from being rendered.
	            return

	        if not texts:
	            # Nothing to classify; an empty result table would fail on the
	            # column lookups below.
	            st.warning("The uploaded file contains no text to analyze.")
	            return

	        st.success(f"Loaded {len(texts)} texts for analysis")

	        if not st.button("🚀 Analyze All", type="primary"):
	            return

	        model = load_model(selected_model)
	        if not model:
	            st.error("Failed to load the model.")
	            return

	        results = []
	        progress_bar = st.progress(0)

	        for i, text in enumerate(texts):
	            label, confidence, language_name = predict_language(text, model, lang_names)
	            results.append({
	                # Long inputs are shortened for display only
	                'Text': (text[:100] + '...') if len(text) > 100 else text,
	                'Language': language_name,
	                'Code': label,
	                'Confidence': confidence
	            })
	            progress_bar.progress((i + 1) / len(texts))

	        # Results DataFrame
	        results_df = pd.DataFrame(results)

	        # Display results
	        st.markdown("### 📊 Analysis Results")
	        st.dataframe(results_df, use_container_width=True)

	        # Summary statistics
	        col1, col2 = st.columns(2)

	        with col1:
	            st.markdown("### 📈 Language Distribution")
	            lang_counts = results_df['Language'].value_counts()
	            st.bar_chart(lang_counts)

	        with col2:
	            st.markdown("### 📊 Average Confidence by Language")
	            avg_conf = results_df.groupby('Language')['Confidence'].mean().sort_values(ascending=False)
	            st.bar_chart(avg_conf)

	        # Download button
	        csv_data = results_df.to_csv(index=False)
	        st.download_button(
	            label="📥 Download Results (CSV)",
	            data=csv_data,
	            file_name="language_identification_results.csv",
	            mime="text/csv"
	        )

	    except Exception as e:
	        # UI boundary: show the problem instead of crashing the page.
	        st.error(f"Error processing file: {str(e)}")


	def _render_about_tab() -> None:
	    """Render the paper summary, the citation and the acknowledgments."""
	    render_paper_info()

	    st.markdown("---")

	    st.markdown("### 📖 Citation")
	    render_citation()

	    st.markdown("---")

	    st.markdown("""
	### 🏛️ Acknowledgments

	This work is part of the Data Science for Social Impact Research Group at the University of Pretoria.

	Contact:
	- 📧 Email: vukosi.marivate@cs.up.ac.za
	- 🐦 Twitter: [@VukosiiM](https://twitter.com/VukosiiM)
	- 🌐 Website: [dsfsi.github.io](https://dsfsi.github.io)
	""")


	def main():
	    """Render the whole app: header, sidebar and the three content tabs."""
	    # Header
	    st.markdown("""
	<div class="main-header">
	<h1>🇿🇦 South African Language Identification</h1>
	<p>Multilingual Language Detection for South African Languages</p>
	</div>
	""", unsafe_allow_html=True)

	    # Load language names
	    lang_names = load_language_names()

	    # Sidebar
	    selected_model = _render_sidebar()

	    # Main content
	    tab1, tab2, tab3 = st.tabs(["🔍 Single Text", "📁 Bulk Analysis", "📄 About"])

	    with tab1:
	        _render_single_text_tab(selected_model, lang_names)

	    with tab2:
	        _render_bulk_tab(selected_model, lang_names)

	    with tab3:
	        _render_about_tab()


	if __name__ == "__main__":
	    main()