Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
# coding=utf-8
import base64
import json
import logging
import os
import re
import string
from typing import Dict, List, Optional, Tuple

import altair as alt
import fasttext
import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download
from matplotlib.figure import Figure
from transformers import pipeline
# Configure page
st.set_page_config(
    page_title="South African Language Identification",
    # Regional-indicator letters Z + A (rendered as the South African flag where
    # supported). Written as escapes so the icon survives re-encoding of this file.
    page_icon="\U0001F1FF\U0001F1E6",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Custom CSS for better styling.
# Defines the four classes used by the HTML snippets rendered elsewhere in this
# file: .main-header, .model-card, .result-container and .metric-card.
_CUSTOM_CSS = """
<style>
.main-header {
text-align: center;
padding: 1rem 0;
background: linear-gradient(90deg, #ff6b35, #f7931e);
color: white;
border-radius: 10px;
margin-bottom: 2rem;
}
.model-card {
background: #f8f9fa;
padding: 1rem;
border-radius: 8px;
border-left: 4px solid #ff6b35;
margin: 1rem 0;
}
.result-container {
background: white;
padding: 1.5rem;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
margin: 1rem 0;
}
.metric-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 1rem;
border-radius: 8px;
text-align: center;
}
</style>
"""
# unsafe_allow_html is required so the <style> element is emitted as HTML.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Constants and Configuration
# Registry of selectable models, keyed by a short identifier.
# Each entry has a display "name", a "model_id" handed to the transformers
# pipeline, and a "description" shown in the sidebar. The optional
# "recommended" flag is read with .get(), so entries may omit it.
# Insertion order matters: the sidebar select box defaults to the first key.
MODEL_CONFIGS: Dict[str, dict] = {
    "za-bert": {
        "name": "ZA-BERT",
        "model_id": "dsfsi/za-lid-bert",
        "description": "Lightweight BERT-based model trained on South African languages",
        "recommended": True,
    },
    "xlmr-large": {
        "name": "XLM-R Large",
        "model_id": "dsfsi/za-xlmrlarge-lid",
        "description": "XLM-RoBERTa Large model fine-tuned for SA languages",
    },
    "serengeti": {
        "name": "Serengeti",
        "model_id": "dsfsi/za-serengeti-lid",
        "description": "Afri-centric model with superior performance",
    },
    "afriberta": {
        "name": "AfriBERTa",
        "model_id": "dsfsi/za-afriberta-lid",
        "description": "African-focused BERT model",
    },
    "afro-xlmr": {
        "name": "Afro-XLM-R",
        "model_id": "dsfsi/za-afro-xlmr-base-lid",
        "description": "African-centric XLM-RoBERTa model",
    },
    "afrolm": {
        "name": "AfroLM",
        "model_id": "dsfsi/za-afrolm-lid",
        "description": "African language model",
    },
}
# Utility Functions
def load_language_names() -> Dict[str, str]:
    """Return the mapping from language code to display name.

    The mapping is read from ``assets/language_names.json`` (relative to the
    current working directory). If that file cannot be opened, is not valid
    UTF-8, or is not valid JSON, a built-in table covering the South African
    languages plus ``"und"`` is returned instead, so a missing or damaged
    asset never stops the app from starting.

    Returns:
        Whatever the JSON file decodes to, or the built-in fallback dict.
    """
    fallback = {
        "afr": "Afrikaans",
        "eng": "English",
        "nso": "Northern Sotho",
        "sot": "Sesotho",
        "ssw": "Siswati",
        "tsn": "Setswana",
        "tso": "Xitsonga",
        "ven": "Tshivenda",
        "xho": "isiXhosa",
        "zul": "isiZulu",
        "nbl": "isiNdebele",
        "und": "Undetermined",
    }
    try:
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open("assets/language_names.json", "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return fallback
@st.cache_resource
def _load_pipeline(model_id: str):
    """Build the text-classification pipeline for ``model_id``.

    Wrapped in ``st.cache_resource`` so the pipeline is constructed once per
    model id and reused across Streamlit reruns. Errors propagate to the
    caller; because this helper raises rather than returning a sentinel, a
    failed load leaves no ``None`` result behind in the cache.
    """
    return pipeline("text-classification", model=model_id)


def load_model(model_key: str):
    """Load and cache models.

    Args:
        model_key: A key of ``MODEL_CONFIGS``.

    Returns:
        The pipeline for the selected model, or ``None`` if the key is unknown
        or the pipeline could not be created. In the failure case the error is
        also shown in the UI via ``st.error``.
    """
    try:
        return _load_pipeline(MODEL_CONFIGS[model_key]["model_id"])
    except Exception as e:
        st.error(f"Error loading model {model_key}: {str(e)}")
        return None
# Characters replaced by a single space before classification: colon, bullet
# (U+2022), '#', braces, pipe and the ASCII digits. The bullet is written as an
# escape so it cannot be corrupted by a re-encoding of this file.
_STRIP_TABLE = str.maketrans({ch: " " for ch in ":\u2022#{|}" + string.digits})


def preprocess_text(text: str) -> str:
    """Clean and preprocess input text.

    Steps: replace each character in the strip set with a space, collapse
    every run of whitespace (newlines included) into one space, and trim the
    ends.

    Args:
        text: Raw input; may be empty or ``None``.

    Returns:
        The cleaned text, or ``""`` when the input is empty/whitespace-only or
        nothing remains after cleaning.
    """
    if not text or not text.strip():
        return ""
    cleaned = text.translate(_STRIP_TABLE)
    # \s+ also matches '\n', so no separate newline replacement is needed.
    return re.sub(r"\s+", " ", cleaned).strip()
def get_language_name(label: str, lang_names: Dict[str, str]) -> str:
    """Get language name from label.

    The part of ``label`` before the first underscore (or the whole label when
    there is none) is looked up in ``lang_names``.

    Args:
        label: Model output label, e.g. ``"zul"`` or ``"zul_Latn"``.
        lang_names: Mapping from code to display name.

    Returns:
        The mapped display name, or ``label`` unchanged when the code is not
        in ``lang_names``.
    """
    # partition() yields the full string as the head when '_' is absent, so no
    # separate branch is needed for labels without a suffix.
    iso_code = label.partition("_")[0]
    return lang_names.get(iso_code, label)
def predict_language(text: str, model, lang_names: Dict[str, str]) -> Tuple[str, float, str]:
    """Predict language for given text.

    Args:
        text: Raw input text; ``None`` and blank strings are accepted.
        model: Callable classifier (as returned by ``load_model``). It is
            called with the preprocessed text and is expected to return a
            list of dicts carrying ``"label"`` and ``"score"``.
        lang_names: Mapping from language code to display name.

    Returns:
        ``(label, confidence, language_name)``.
        ``("und", 0.0, "Undetermined")`` is returned when there is no model,
        no usable text, or the model returns nothing usable.
        ``("und", 0.0, "Error")`` is returned when the model call raises; the
        message is also shown via ``st.error``.
    """
    undetermined = ("und", 0.0, "Undetermined")
    # `not text` guards against None before .strip() is attempted.
    if not model or not text or not text.strip():
        return undetermined
    try:
        processed_text = preprocess_text(text)
        if not processed_text:
            return undetermined
        # NOTE(review): the text is passed at full length; presumably inputs
        # beyond the model's maximum sequence length raise here and end up in
        # the except branch — confirm, and consider truncating.
        result = model(processed_text)
        if isinstance(result, list) and result:
            top = result[0]
            label = top["label"]
            confidence = top["score"]
            return label, confidence, get_language_name(label, lang_names)
        return undetermined
    except Exception as e:
        st.error(f"Prediction error: {str(e)}")
        return "und", 0.0, "Error"
def create_confidence_plot(language: str, confidence: float) -> plt.Figure:
    """Create a confidence visualization.

    Draws one horizontal bar on a 0..1 axis: the filled part is the
    confidence, the remainder is a faint background bar.

    The figure is built directly from ``Figure`` rather than through
    ``plt.subplots`` so it is not registered in pyplot's global figure list
    and can be garbage-collected once the caller drops it.

    Args:
        language: Display name used in the title.
        confidence: Score expected in [0, 1]. Values outside that range are
            clamped for the bar geometry only; the title and label show the
            value as given.

    Returns:
        The matplotlib figure.
    """
    # Colors
    primary_color = "#ff6b35"
    bg_color = "#f8f9fa"
    text_color = "#2c3e50"

    fig = Figure(figsize=(10, 2))
    ax = fig.subplots()

    # Clamp so neither bar segment gets a negative width.
    filled = min(max(confidence, 0.0), 1.0)

    # Create horizontal bar
    ax.barh([0], [filled], color=primary_color, height=0.6, alpha=0.8)
    ax.barh([0], [1 - filled], left=[filled], color=bg_color, height=0.6, alpha=0.3)

    # Styling
    ax.set_xlim(0, 1)
    ax.set_ylim(-0.5, 0.5)
    ax.set_xlabel("Confidence Score", fontsize=12, color=text_color)
    ax.set_title(
        f"Language: {language} (Confidence: {confidence:.3f})",
        fontsize=14, fontweight='bold', color=text_color, pad=20,
    )

    # Remove y-axis and spines
    ax.set_yticks([])
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

    # Add confidence text. White text centred in the bar is only legible when
    # the bar is wide enough to contain it; for short bars the label is placed
    # just to the right of the bar in the dark text colour instead.
    percent = f"{confidence:.1%}"
    if filled >= 0.15:
        ax.text(filled / 2, 0, percent,
                ha='center', va='center', fontweight='bold', color='white')
    else:
        ax.text(filled + 0.01, 0, percent,
                ha='left', va='center', fontweight='bold', color=text_color)

    fig.tight_layout()
    return fig
def render_paper_info():
    """Render paper information and citation.

    Lays out two columns (2:1): the paper title, authors, venue and a short
    summary on the left, and links to the paper, the Hugging Face organisation
    and the GitHub repository on the right.
    """
    # NOTE(review): the glyphs in front of "Research Paper", "Paper",
    # "HuggingFace" and "GitHub" look like mis-decoded emoji; they are kept
    # as found because the originals cannot be determined from this file.
    st.markdown("### π Research Paper")
    summary_col, links_col = st.columns([2, 1])
    with summary_col:
        st.markdown("""
**"From N-grams to Pre-trained Multilingual Models For Language Identification"**
*Authors: Thapelo Andrew Sindane, Vukosi Marivate*
Published in: Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities (2024)
This research investigates N-gram models and large pre-trained multilingual models for Language Identification
across 11 South African languages, showing that Serengeti performs best across all model types.
""")
    with links_col:
        st.markdown("""
**Links:**
- [π Paper](https://aclanthology.org/2024.nlp4dh-1.22/)
- [π€ HuggingFace](https://huggingface.co/dsfsi)
- [π» GitHub](https://github.com/dsfsi/za-lid)
""")
def render_citation():
    """Render BibTeX citation.

    Shows the BibTeX entry for the paper in a code block so it can be copied
    verbatim.
    """
    # The editor names contain non-ASCII letters (ä, Ö); this file must be
    # saved as UTF-8 for them to display correctly.
    citation = """@inproceedings{sindane-marivate-2024-n,
title = "From N-grams to Pre-trained Multilingual Models For Language Identification",
author = "Sindane, Thapelo Andrew and Marivate, Vukosi",
editor = "Hämäläinen, Mika and Öhman, Emily and Miyagawa, So and Alnajjar, Khalid and Bizzoni, Yuri",
booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",
month = nov,
year = "2024",
address = "Miami, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.nlp4dh-1.22/",
doi = "10.18653/v1/2024.nlp4dh-1.22",
pages = "229--239"
}"""
    st.code(citation, language='bibtex')
# Session-state key of the single-text input; shared by the widget and by the
# Clear button's callback.
_SINGLE_TEXT_KEY = "single_text_input"


def _clear_single_text() -> None:
    """Button callback: blank the single-text input before the next rerun."""
    st.session_state[_SINGLE_TEXT_KEY] = ""


def _render_sidebar() -> str:
    """Draw the sidebar (model picker, model card, language list); return the chosen model key."""
    # NOTE(review): several labels below start with glyphs that look like
    # mis-decoded emoji. Only the ones that decode unambiguously (the bullet
    # and the English flag) were repaired; the rest are kept as found.
    with st.sidebar:
        st.header("βοΈ Model Configuration")
        # Model selection
        selected_model = st.selectbox(
            "Choose Model:",
            options=list(MODEL_CONFIGS.keys()),
            format_func=lambda x: f"{'β ' if MODEL_CONFIGS[x].get('recommended') else ''}{MODEL_CONFIGS[x]['name']}",
            index=0,
            help="Select the language identification model",
        )
        # Model info
        model_config = MODEL_CONFIGS[selected_model]
        st.markdown(f"""
<div class="model-card">
<h4>{model_config['name']}</h4>
<p>{model_config['description']}</p>
</div>
""", unsafe_allow_html=True)
        # Supported languages
        st.subheader("π Supported Languages")
        supported_langs = [
            "π΄σ Ίσ ‘σ Ίσ ‘σ Ώ Afrikaans", "\U0001F1EC\U0001F1E7 English", "π Northern Sotho",
            "π Sesotho", "π Siswati", "π Setswana",
            "π Xitsonga", "π Tshivenda", "π isiXhosa",
            "π isiZulu", "π isiNdebele",
        ]
        for lang in supported_langs:
            st.write(f"\u2022 {lang}")
    return selected_model


def _render_single_text_tab(selected_model: str, lang_names: Dict[str, str]) -> None:
    """Draw the single-text tab: input box, Analyze/Clear buttons and the result view."""
    st.header("Single Text Analysis")
    # Text input. The key lets the Clear callback reset the widget's value.
    user_text = st.text_area(
        "Enter text to identify language:",
        placeholder="Type or paste your text here...",
        height=100,
        help="Enter text in any South African language",
        key=_SINGLE_TEXT_KEY,
    )
    analyze_col, clear_col, _ = st.columns([1, 1, 2])
    with analyze_col:
        analyze_button = st.button("π Analyze", type="primary", use_container_width=True)
    with clear_col:
        # The callback runs before the rerun that the click triggers, so the
        # text area comes back empty without an explicit st.rerun().
        st.button("ποΈ Clear", use_container_width=True, on_click=_clear_single_text)

    if not analyze_button:
        return
    if not user_text.strip():
        st.warning("Please enter some text to analyze.")
        return

    with st.spinner("Analyzing language..."):
        # Load model
        model = load_model(selected_model)
        if not model:
            st.error("Failed to load the model. Please try again.")
            return
        # Predict
        label, confidence, language_name = predict_language(user_text, model, lang_names)
        # Results
        st.markdown("### π Results")
        # Metrics: one card per column, all sharing the same markup.
        cards = (
            (language_name, "Detected Language"),
            (f"{confidence:.1%}", "Confidence"),
            (label, "Language Code"),
        )
        for column, (value, caption) in zip(st.columns(3), cards):
            with column:
                st.markdown(f"""
<div class="metric-card">
<h3>{value}</h3>
<p>{caption}</p>
</div>
""", unsafe_allow_html=True)
        # Confidence visualization
        st.markdown("### π Confidence Visualization")
        fig = create_confidence_plot(language_name, confidence)
        st.pyplot(fig)
        # Release the figure if pyplot is tracking it; a no-op otherwise.
        plt.close(fig)


def _read_uploaded_texts(uploaded_file) -> Optional[List[str]]:
    """Extract the non-blank texts from an uploaded .csv or .txt file.

    Returns ``None`` when a CSV has no ``text`` column; otherwise a (possibly
    empty) list of stripped strings.
    """
    if uploaded_file.name.lower().endswith('.csv'):
        df = pd.read_csv(uploaded_file)
        if 'text' not in df.columns:
            return None
        # dropna() first: astype(str) would turn missing cells into "nan".
        raw = df['text'].dropna().astype(str)
    else:
        # utf-8-sig decodes plain UTF-8 too and drops a leading BOM if present.
        raw = uploaded_file.read().decode('utf-8-sig').splitlines()
    stripped = (item.strip() for item in raw)
    return [item for item in stripped if item]


def _render_bulk_tab(selected_model: str, lang_names: Dict[str, str]) -> None:
    """Draw the bulk tab: file upload, per-text predictions, summary charts and CSV download."""
    st.header("Bulk Text Analysis")
    uploaded_file = st.file_uploader(
        "Upload a text file",
        type=['txt', 'csv'],
        help="Upload a .txt file with one sentence per line, or a CSV file with a 'text' column",
    )
    if not uploaded_file:
        return
    try:
        # Read file
        texts = _read_uploaded_texts(uploaded_file)
        if texts is None:
            # Return instead of st.stop() so the remaining tabs still render.
            st.error("CSV file must contain a 'text' column")
            return
        if not texts:
            # Without this guard the empty result frame has no 'Language' column.
            st.warning("The uploaded file contains no text to analyze.")
            return
        st.success(f"Loaded {len(texts)} texts for analysis")
        if not st.button("π Analyze All", type="primary"):
            return
        model = load_model(selected_model)
        if not model:
            st.error("Failed to load the model.")
            return

        results = []
        progress_bar = st.progress(0)
        total = len(texts)
        for done, text in enumerate(texts, start=1):
            label, confidence, language_name = predict_language(text, model, lang_names)
            results.append({
                # Long texts are shortened for display only.
                'Text': text[:100] + '...' if len(text) > 100 else text,
                'Language': language_name,
                'Code': label,
                'Confidence': confidence,
            })
            progress_bar.progress(done / total)

        # Results DataFrame
        results_df = pd.DataFrame(results)
        # Display results
        st.markdown("### π Analysis Results")
        st.dataframe(results_df, use_container_width=True)
        # Summary statistics
        counts_col, conf_col = st.columns(2)
        with counts_col:
            st.markdown("### π Language Distribution")
            lang_counts = results_df['Language'].value_counts()
            st.bar_chart(lang_counts)
        with conf_col:
            st.markdown("### π Average Confidence by Language")
            avg_conf = results_df.groupby('Language')['Confidence'].mean().sort_values(ascending=False)
            st.bar_chart(avg_conf)
        # Download button
        csv_data = results_df.to_csv(index=False)
        st.download_button(
            label="π₯ Download Results (CSV)",
            data=csv_data,
            file_name="language_identification_results.csv",
            mime="text/csv",
        )
    except Exception as e:
        # UI boundary: report any read/parse/inference failure to the user.
        st.error(f"Error processing file: {str(e)}")


def _render_about_tab() -> None:
    """Draw the About tab: paper summary, citation and acknowledgments."""
    render_paper_info()
    st.markdown("---")
    st.markdown("### π Citation")
    render_citation()
    st.markdown("---")
    st.markdown("""
### ποΈ Acknowledgments
This work is part of the Data Science for Social Impact Research Group at the University of Pretoria.
**Contact:**
- π§ Email: vukosi.marivate@cs.up.ac.za
- π¦ Twitter: [@VukosiiM](https://twitter.com/VukosiiM)
- π Website: [dsfsi.github.io](https://dsfsi.github.io)
""")


def main():
    """Build the page: header, sidebar, and the three content tabs."""
    # Header. The flag is written as escapes (regional indicators Z + A).
    st.markdown("""
<div class="main-header">
<h1>\U0001F1FF\U0001F1E6 South African Language Identification</h1>
<p>Multilingual Language Detection for South African Languages</p>
</div>
""", unsafe_allow_html=True)
    # Load language names
    lang_names = load_language_names()
    # Sidebar
    selected_model = _render_sidebar()
    # Main content
    tab1, tab2, tab3 = st.tabs(["π Single Text", "π Bulk Analysis", "π About"])
    with tab1:
        _render_single_text_tab(selected_model, lang_names)
    with tab2:
        _render_bulk_tab(selected_model, lang_names)
    with tab3:
        _render_about_tab()
# Script entry point: build the page when the file is executed directly.
if __name__ == "__main__":
    main()
 
			

