# XLIT-TESTING
import gradio as gr
import pandas as pd
import requests
from typing import Dict, List, Optional, Tuple, Union
import io

# NOTE(review): the emoji prefixes in the printed messages throughout this
# file appear mis-encoded (mojibake). They are runtime output, so they are
# left untouched here -- confirm the intended source encoding.


# IndicXlit API client
class IndicXlitClient:
    """Simple client for IndicXlit Transliteration API.

    All methods are best-effort: any exception raised while talking to the
    API is caught and turned into an "empty" result (error dict, empty list
    or empty dict) so that a batch run is never aborted by one bad request.
    """

    def __init__(
        self,
        api_url: str = "https://awake-blowfish-liberal.ngrok-free.app",
        timeout: float = 30.0,
    ):
        """Create a client bound to *api_url*.

        Args:
            api_url: Base URL of the API; a trailing slash is stripped.
            timeout: Seconds to wait for each HTTP request. Without a
                timeout ``requests`` waits indefinitely, which would hang
                the notebook if the tunnel is down.
        """
        self.api_url = api_url.rstrip('/')
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'Content-Type': 'application/json',
            'Accept': 'application/json',
        })

    def health_check(self) -> dict:
        """Return the JSON body of ``GET /health``.

        On any failure a dict with keys ``"error"`` and
        ``"status": "unhealthy"`` is returned instead of raising.
        """
        try:
            response = self.session.get(
                f"{self.api_url}/health", timeout=self.timeout
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            return {"error": str(e), "status": "unhealthy"}

    def get_supported_languages(self) -> List[str]:
        """Return the ``supported_languages`` list from ``GET /languages``.

        Returns an empty list (and prints the error) on any failure.
        """
        try:
            response = self.session.get(
                f"{self.api_url}/languages", timeout=self.timeout
            )
            response.raise_for_status()
            data = response.json()
            return data.get("supported_languages", [])
        except Exception as e:
            print(f"Error getting languages: {e}")
            return []

    def english_to_indic(
        self,
        text: str,
        target_languages: Union[str, List[str]],
        beam_width: int = 4,
    ) -> Dict[str, str]:
        """Transliterate *text* via ``POST /transliterate/en-to-indic``.

        Args:
            text: Romanised input text.
            target_languages: One language code or a list of codes.
            beam_width: Forwarded to the API as ``beam_width``.

        Returns:
            The ``results`` mapping from the response when the response
            reports ``success``; otherwise an empty dict (the problem is
            printed).
        """
        try:
            payload = {
                "text": text,
                "target_languages": target_languages,
                "beam_width": beam_width,
            }
            response = self.session.post(
                f"{self.api_url}/transliterate/en-to-indic",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            result = response.json()
            if result.get("success"):
                return result.get("results", {})
            print(f"API Error: {result}")
            return {}
        except Exception as e:
            print(f"Error transliterating: {e}")
            return {}


# Create global client instance
client = IndicXlitClient()


# Convenience functions
def transliterate_from_en(
    text: str, target_languages: Union[str, List[str]]
) -> Dict[str, str]:
    """Transliterate *text* with the global client (default beam width)."""
    return client.english_to_indic(text, target_languages)


def get_supported_languages() -> List[str]:
    """Return the language list reported by the global client."""
    return client.get_supported_languages()


def check_api_health() -> bool:
    """Return True when the health endpoint reports ``status == "healthy"``."""
    health = client.health_check()
    return health.get("status") == "healthy"


# Test API connectivity
print("šŸ”„ Testing IndicXlit API connectivity...")
if check_api_health():
    print("āœ… IndicXlit API is healthy and ready!")
    supported_langs = get_supported_languages()
    print(f"šŸ“‹ Supported languages: {supported_langs}")
    print(f"šŸ“Š Total supported languages: {len(supported_langs)}")
else:
    print("āš ļø IndicXlit API is not available")
    print("āŒ Please check your API URL or connection")
print("āœ… IndicXlit API setup completed!")

# Master language mapping for IndicXlit model testing
INDICXLIT_LANGUAGE_MAPPING = {
    # Language name to IndicXlit API code mapping
    'assamese': 'as',
    'bengali': 'bn',
    'bodo': 'brx',
    'gujarati': 'gu',
    'hindi': 'hi',
    'kannada': 'kn',
    'kashmiri': 'ks',
    'konkani': 'gom',  # IndicXlit uses 'gom' for Konkani
    'maithili': 'mai',
    'malayalam': 'ml',
    'marathi': 'mr',
    'manipuri': 'mni',
    'nepali': 'ne',
    'odia': 'or',
    'punjabi': 'pa',
    'sanskrit': 'sa',
    'sindhi': 'sd',
    'tamil': 'ta',
    'telugu': 'te',
    'urdu': 'ur',
}

# Languages NOT supported by IndicXlit (based on your previous testing)
UNSUPPORTED_LANGUAGES = ['dogri', 'santali']

print("šŸ“‹ IndicXlit Language Mapping:")
for lang_name, code in INDICXLIT_LANGUAGE_MAPPING.items():
    print(f" {lang_name.capitalize()}: {code}")
print(f"\nāš ļø Unsupported languages: {', '.join(UNSUPPORTED_LANGUAGES)}")
print(f"āœ… Total mappings loaded: {len(INDICXLIT_LANGUAGE_MAPPING)}")

from google.colab import files
import pandas as pd


def _identify_columns(columns) -> Dict[str, object]:
    """Map the roles 'language'/'roman'/'native'/'english' to column labels.

    Matching is a case-insensitive substring test on each header, checked in
    that order of precedence; if several headers match the same role the
    last one wins. Headers are coerced with ``str()`` so non-string labels
    (e.g. numeric headers) do not raise.
    """
    column_mapping = {}
    for col in columns:
        col_lower = str(col).lower().strip()
        if 'language' in col_lower:
            column_mapping['language'] = col
        elif 'roman' in col_lower:
            column_mapping['roman'] = col
        elif 'native' in col_lower:
            column_mapping['native'] = col
        elif 'english' in col_lower:
            column_mapping['english'] = col
    return column_mapping


def _transliterate_row(language: str, roman_text: str) -> Tuple[str, str, str]:
    """Return ``(output, status, target_code)`` for one dataset row.

    *language* must already be lower-cased and stripped. On API failure the
    output falls back to the untouched *roman_text*.
    """
    if language in UNSUPPORTED_LANGUAGES:
        return "NOT_SUPPORTED", "UNSUPPORTED_LANGUAGE", "N/A"

    if language not in INDICXLIT_LANGUAGE_MAPPING:
        return "LANGUAGE_NOT_MAPPED", "UNKNOWN_LANGUAGE", "N/A"

    target_code = INDICXLIT_LANGUAGE_MAPPING[language]
    try:
        # Use IndicXlit API for transliteration
        api_results = transliterate_from_en(roman_text, target_code)
        if api_results and target_code in api_results:
            return api_results[target_code], "SUCCESS", target_code
        return roman_text, "API_FAILED", target_code  # Fallback to original
    except Exception as e:
        return roman_text, f"ERROR: {str(e)}", target_code  # Fallback to original


def process_excel_dataset_with_indicxlit():
    """
    Process Excel dataset using ONLY IndicXlit model

    Input: Excel file with columns - Language, Roman Script, Native Script,
    English Translation
    Output: Excel with all ground truth columns + IndicXlit Native Output

    Prompts for an upload through ``google.colab.files``, transliterates
    every row and writes ``indicxlit_excel_results_with_ground_truth.xlsx``.

    Only the first uploaded file is processed: the function returns as soon
    as that file has been handled (successfully or not).

    Returns:
        The results DataFrame, or None when the required columns cannot be
        identified, an error occurs, or nothing was uploaded.
    """
    print("šŸ“ Please upload your Excel file containing the dataset...")
    uploaded = files.upload()

    for filename in uploaded:
        print(f"šŸ“„ Processing file: {filename}")

        # Read the Excel file
        try:
            df_input = pd.read_excel(filename)
            total = len(df_input)
            print(f"āœ… Successfully loaded Excel with {total} rows")

            # Display column names to verify structure
            print(f"šŸ“‹ Columns found: {list(df_input.columns)}")

            # Identify columns (case-insensitive matching)
            column_mapping = _identify_columns(df_input.columns)
            print(f"šŸ” Column mapping: {column_mapping}")

            # Check if all required columns are found
            if len(column_mapping) < 4:
                print("āŒ Could not identify all required columns (Language, Roman, Native, English)")
                return None

            results = []
            print(f"šŸ”„ Processing {total} samples with IndicXlit model...")

            # Count rows positionally: the index label yielded by iterrows()
            # is not guaranteed to be a 0-based integer.
            for count, (_, row) in enumerate(df_input.iterrows(), start=1):
                language = str(row[column_mapping['language']]).lower().strip()
                roman_text = str(row[column_mapping['roman']]).strip()
                native_ground_truth = str(row[column_mapping['native']]).strip()
                english_text = str(row[column_mapping['english']]).strip()

                indicxlit_native_output, status, target_code = _transliterate_row(
                    language, roman_text
                )

                # Create result row with all ground truth + IndicXlit output
                results.append({
                    'Language': language.capitalize(),
                    'Roman_Script_Input': roman_text,
                    'Native_Script_Ground_Truth': native_ground_truth,
                    'English_Translation_Ground_Truth': english_text,
                    'IndicXlit_Native_Output': indicxlit_native_output,
                    'Processing_Status': status,
                    'IndicXlit_Code': target_code,
                })

                if count % 50 == 0:
                    print(f"āœ… Processed {count}/{total} samples...")

            # Create results DataFrame
            df_results = pd.DataFrame(results)
            succeeded = df_results['Processing_Status'] == 'SUCCESS'

            # Display summary
            print("\nšŸ“Š Processing Summary:")
            print(f"Total samples processed: {len(df_results)}")
            print(f"Successful translations: {len(df_results[succeeded])}")
            print(f"Failed translations: {len(df_results[~succeeded])}")

            # Language-wise breakdown
            print("\nšŸ“ˆ Language-wise breakdown:")
            lang_summary = df_results['Language'].value_counts()
            for lang, count in lang_summary.items():
                success_count = len(
                    df_results[(df_results['Language'] == lang) & succeeded]
                )
                print(f" {lang}: {count} total, {success_count} successful")

            # Save to Excel
            output_filename = "indicxlit_excel_results_with_ground_truth.xlsx"
            df_results.to_excel(output_filename, index=False, engine='openpyxl')
            print(f"\nšŸ’¾ Results saved to: {output_filename}")

            # NOTE(review): the original carried a "Download the file"
            # comment here with no files.download(...) call after it --
            # confirm whether the download step was removed on purpose.

            # Display first few rows
            print("\nšŸ“‹ Sample Results:")
            print(df_results.head())

            return df_results

        except Exception as e:
            print(f"āŒ Error processing Excel file: {str(e)}")
            return None

    # Nothing was uploaded.
    return None


# Run the processing function
print("šŸš€ Ready to process Excel dataset with IndicXlit model")
print("šŸ“Š Expected Excel columns: Language, Roman Script, Native Script, English Translation")
print("šŸ‘† Execute the function below to start:")
print("df_results = process_excel_dataset_with_indicxlit()")
df_results = process_excel_dataset_with_indicxlit()