import json
import warnings
import re
import os

import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.prompts import PromptTemplate
import gradio as gr

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")


class ZarmaLanguageAnalyzer:
    def __init__(self, grammar_path: str, glossary_path: str):
        """
        Initialize the Zarma Language Analyzer with grammar rules and glossary.
        Optimized for CPU usage on Hugging Face Spaces.
        """
        print("Running on CPU for Hugging Face Spaces.")
        self.grammar_rules = self._load_json(grammar_path).get("grammar_rules", [])
        self.glossary_data = self._load_json(glossary_path)
        self._setup_models()
        self._setup_vectorstore()

    def _load_json(self, file_path: str) -> dict:
        """Load and parse a JSON file."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
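
    # A minimal, hypothetical sketch of the two input files, inferred from the key
    # lookups used throughout this class; the real grammar_rules.json and
    # glossary.json may carry additional fields, and the "aller"/"koy" entry is
    # illustrative only.
    #
    # grammar_rules.json:
    #     {"grammar_rules": [
    #         {"rule_id": "...", "category": "...", "subcategory": "...",
    #          "description": "...",
    #          "examples": [{"zarma": "...", "corrected_zarma": "...", "english": "..."}]}
    #     ]}
    #
    # glossary.json:
    #     [{"fr": "aller", "dje": "koy", "notes": "..."}]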

    def _setup_models(self):
        """Set up the Gemini-2.0-flash model via the Google Generative AI API."""
        # Get the API key from an environment variable
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY environment variable not set.")
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")
        self.analysis_template = PromptTemplate(
            input_variables=["sentence", "grammar_check", "glossary_info", "language"],
            template="""
You are a Zarma language expert. Analyze this Zarma sentence: "{sentence}"
Rely primarily on your expertise in Zarma grammar and meaning. Recognize proper nouns (e.g., names or places) as such unless the glossary explicitly contradicts this with a common Zarma meaning. Use the grammar check and glossary below as supplementary aids only—do not override your knowledge unless they provide clear, contextually relevant insight.
Grammar check results (optional guide):
{grammar_check}
Glossary information (use it but prioritize your expertise to confirm):
{glossary_info}
Provide a detailed linguistic analysis in {language} in this exact format, with no extra text outside the sections:
1. WORD BREAKDOWN:
- [List each word with its grammatical role and meaning, e.g., "Ay: 1st person singular pronoun, meaning 'I'."]
2. LINGUISTIC INSIGHT:
- Word Order: [Describe typical Zarma word order (e.g., SOV, SVO) and how this sentence aligns or deviates]
- Tense/Aspect Markers: [Explain tense/aspect markers like 'ga', 'goono ga', or none for past, with examples like "Ay ga koy" (I will go)]
- Contextual Insight: [Discuss what the sentence might intend to convey and any external influences or errors]
3. CORRECTNESS ASSESSMENT:
- Is the sentence correct? [Yes/No, with explanation]
- Reason for Incorrectness (if applicable): [Detailed reason why it’s wrong, e.g., misplaced particle]
- Corrections (depending on intended meaning):
- [Option 1: Corrected sentence with explanation]
- [Option 2: Corrected sentence with explanation]
- [Option 3: Corrected sentence with explanation]
"""
        )

    def _setup_vectorstore(self):
        """Set up FAISS vector store with the glossary for retrieval."""
        embed_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"}  # Force CPU usage
        )
        documents = []
        for entry in self.glossary_data:
            fr_word = entry.get("fr", "")
            dje_word = entry.get("dje", "")
            notes = entry.get("notes", "No additional context available")
            content = f"French: {fr_word}\nDjerma: {dje_word}\nNotes: {notes}"
            metadata = {"fr": fr_word, "dje": dje_word, "notes": notes}
            documents.append(Document(page_content=content, metadata=metadata))
        self.vectorstore = FAISS.from_documents(documents, embed_model)
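
    # Sketch of how the index built above gets queried in translate_and_explain_words
    # below: FAISS.similarity_search returns LangChain Document objects whose metadata
    # mirrors the fr/dje/notes fields stored per glossary entry. The query word "koy"
    # is only an illustrative example.
    #
    #     hits = self.vectorstore.similarity_search("koy", k=1)
    #     if hits:
    #         closest = hits[0].metadata  # e.g. {"fr": "...", "dje": "...", "notes": "..."}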

    def check_grammar(self, sentence: str) -> list:
        """Check if the sentence violates any grammar rules."""
        issues = []
        for rule in self.grammar_rules:
            rule_id = rule.get("rule_id", "")
            category = rule.get("category", "")
            subcategory = rule.get("subcategory", "")
            description = rule.get("description", "")
            examples = rule.get("examples", [])
            for example in examples:
                wrong_phrase = example.get("zarma", "")
                corrected_phrase = example.get("corrected_zarma", "")
                english_example = example.get("english", "")
                if wrong_phrase and wrong_phrase in sentence:
                    explanation = (
                        f"This rule applies because '{wrong_phrase}' doesn't follow {category} norms in Zarma. "
                        f"Specifically, it violates rules related to {subcategory}. "
                        f"The correct form would be '{corrected_phrase or 'unknown'}'. "
                        f"In English, this is similar to: '{english_example}'"
                    )
                    issues.append({
                        "rule_id": rule_id,
                        "category": category,
                        "subcategory": subcategory,
                        "description": description,
                        "wrong_phrase": wrong_phrase,
                        "corrected_phrase": corrected_phrase,
                        "english_example": english_example,
                        "explanation": explanation
                    })
        return issues
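
    # Note: the check above is a plain substring match against each rule's example
    # phrase, so it only flags wordings that appear verbatim in grammar_rules.json;
    # it serves as a lightweight pre-check, and the final judgement is left to the
    # Gemini prompt assembled in analyze_sentence below.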

    def translate_and_explain_words(self, sentence: str) -> dict:
        """Break the sentence into words and find glossary entries."""
        words = sentence.split()
        word_info = {}
        retrieved_context = []
        for word in words:
            clean_word = word.strip(".,!?;:()\"'")
            if not clean_word:
                continue
            exact_match = None
            for entry in self.glossary_data:
                if entry.get("dje", "").lower() == clean_word.lower() or entry.get("fr", "").lower() == clean_word.lower():
                    exact_match = entry
                    break
            if exact_match:
                fr_word = exact_match.get("fr", "")
                dje_word = exact_match.get("dje", "")
                notes = exact_match.get("notes", "No additional context available")
                word_info[clean_word] = {
                    "french": fr_word,
                    "djerma": dje_word,
                    "notes": notes,
                    "match_type": "exact"
                }
                context_entry = f"Word: {clean_word}\nFrench: {fr_word}\nDjerma: {dje_word}\nNotes: {notes}"
                if context_entry not in retrieved_context:
                    retrieved_context.append(context_entry)
            else:
                search_results = self.vectorstore.similarity_search(clean_word, k=1)
                if search_results:
                    result = search_results[0]
                    metadata = result.metadata
                    word_info[clean_word] = {
                        "french": metadata.get("fr", ""),
                        "djerma": metadata.get("dje", ""),
                        "notes": metadata.get("notes", "No additional context available"),
                        "match_type": "semantic"
                    }
                    context_entry = f"Word: {clean_word}\nFrench: {metadata.get('fr', '')}\nDjerma: {metadata.get('dje', '')}\nNotes: {metadata.get('notes', 'No additional context available')}"
                    if context_entry not in retrieved_context:
                        retrieved_context.append(context_entry)
        sentence_results = self.vectorstore.similarity_search(sentence, k=5)
        for result in sentence_results:
            context_entry = result.page_content
            if context_entry not in retrieved_context:
                retrieved_context.append(context_entry)
        top_contexts = retrieved_context[:3]
        return {"word_info": word_info, "retrieved_context": top_contexts}
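
    # For reference, the dictionary returned above has roughly this shape (the
    # values shown are illustrative, not taken from the real glossary):
    #
    #     {
    #       "word_info": {
    #         "koy": {"french": "aller", "djerma": "koy", "notes": "...", "match_type": "exact"}
    #       },
    #       "retrieved_context": ["Word: koy\nFrench: aller\nDjerma: koy\nNotes: ...", ...]
    #     }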

    def format_grammar_issues(self, issues: list) -> str:
        """Format grammar issues for display."""
        if not issues:
            return "No grammar issues detected."
        result = "Grammar Issues Detected:\n\n"
        for i, issue in enumerate(issues, 1):
            result += f"Issue {i}:\n"
            result += f"Rule ID: {issue.get('rule_id', '')}\n"
            result += f"Category: {issue.get('category', '')}\n"
            result += f"Subcategory: {issue.get('subcategory', '')}\n"
            result += f"Description: {issue.get('description', '')}\n"
            result += f"Wrong phrase: '{issue.get('wrong_phrase', '')}'\n"
            result += f"Corrected phrase: '{issue.get('corrected_phrase', '')}'\n"
            result += f"English example: {issue.get('english_example', '')}\n"
            result += f"Explanation: {issue.get('explanation', '')}\n\n"
        return result

    def format_glossary_info(self, glossary_results: dict) -> str:
        """Format glossary information for model input."""
        word_info = glossary_results.get("word_info", {})
        if not word_info:
            return "No glossary matches found for words in the sentence."
        result = "Glossary information:\n\n"
        for word, info in word_info.items():
            result += f"Word: {word}\n"
            result += f"French: {info.get('french', '')}\n"
            result += f"Djerma: {info.get('djerma', '')}\n"
            result += f"Notes: {info.get('notes', '')}\n\n"
        return result

    def filter_reliable_context(self, glossary_results: dict, analysis_result: str) -> list:
        """Keep only the glossary context entries whose headword actually appears in Gemini's analysis."""
        retrieved_context = glossary_results.get("retrieved_context", [])
        analysis_lower = analysis_result.lower()
        reliable_context = []
        for context in retrieved_context:
            # The first line of each entry is either "Word: ..." or "French: ...";
            # take the value after the label and check it against the analysis text.
            lines = context.split("\n")
            word_line = lines[0]
            word = word_line.split(": ")[1].lower()
            if word in analysis_lower:
                reliable_context.append(context)
        return reliable_context[:3]

    def extract_analysis(self, raw_output: str) -> str:
        """Extract the detailed analysis sections."""
        pattern = (
            r"(1\. WORD BREAKDOWN:\s*-\s*.+?)" +
            r"(2\. LINGUISTIC INSIGHT:\s*-\s*Word Order:\s*.+?)" +
            r"(3\. CORRECTNESS ASSESSMENT:\s*-\s*Is the sentence correct\?.+?)(?=\n\n|$)"
        )
        match = re.search(pattern, raw_output, re.DOTALL)
        if match:
            return match.group(1) + "\n" + match.group(2) + "\n" + match.group(3)
        return (
            "1. WORD BREAKDOWN:\n"
            " - Analysis incomplete due to model limitations.\n\n"
            "2. LINGUISTIC INSIGHT:\n"
            " - Word Order: Analysis incomplete.\n"
            " - Tense/Aspect Markers: Analysis incomplete.\n"
            " - Contextual Insight: Analysis incomplete.\n\n"
            "3. CORRECTNESS ASSESSMENT:\n"
            " - Is the sentence correct? Unknown due to model limitations.\n"
            " - Reason for Incorrectness (if applicable): Unknown.\n"
            " - Corrections: None provided."
        )

    def analyze_sentence(self, sentence: str, lang: str = "en") -> dict:
        """Full analysis pipeline for a Zarma sentence using Gemini-2.0-flash."""
        grammar_issues = self.check_grammar(sentence)
        formatted_grammar = self.format_grammar_issues(grammar_issues)
        glossary_results = self.translate_and_explain_words(sentence)
        formatted_glossary = self.format_glossary_info(glossary_results)
        language = "English" if lang == "en" else "French"
        prompt = self.analysis_template.format(
            sentence=sentence,
            grammar_check=formatted_grammar,
            glossary_info=formatted_glossary,
            language=language
        )
        raw_analysis = ""
        try:
            response = self.model.generate_content(prompt)
            raw_analysis = response.text
        except Exception as e:
            raw_analysis = f"Error in analysis generation: {str(e)}"
        analysis_result = self.extract_analysis(raw_analysis)
        reliable_context = self.filter_reliable_context(glossary_results, analysis_result)
        return {
            "sentence": sentence,
            "grammar_issues": grammar_issues,
            "formatted_grammar": formatted_grammar,
            "analysis_result": analysis_result,
            "retrieved_context": reliable_context
        }

    def format_output(self, results: dict, lang: str = "en") -> str:
        """Format the analysis results for Gradio output in the selected language."""
        if lang == "fr":
            output = "=" * 80 + "\n"
            output += "ANALYSEUR DE LANGUE ZARMA\n"
            output += "=" * 80 + "\n\n"
            output += f"Phrase Analysée: \"{results['sentence']}\"\n"
            output += f"État de la Grammaire: {'Problèmes détectés' if results['grammar_issues'] else 'Correct'}\n\n"
            output += "Analyse Détaillée:\n"
            output += "-" * 80 + "\n"
            output += results['analysis_result'] + "\n\n"
            output += "Sources de Contexte Fiables:\n"
            output += "-" * 80 + "\n"
            if results["retrieved_context"]:
                for i, context in enumerate(results["retrieved_context"], 1):
                    output += f"Source {i}:\n{context}\n\n"
            else:
                output += "Aucune source de contexte fiable récupérée basée sur l'analyse.\n"
            output += "=" * 80
        else:  # Default to English
            output = "=" * 80 + "\n"
            output += "ZARMA LANGUAGE ANALYZER\n"
            output += "=" * 80 + "\n\n"
            output += f"Sentence Analyzed: \"{results['sentence']}\"\n"
            output += f"Grammar Status: {'Issues detected' if results['grammar_issues'] else 'Correct'}\n\n"
            output += "Detailed Analysis:\n"
            output += "-" * 80 + "\n"
            output += results['analysis_result'] + "\n\n"
            output += "Reliable Context Sources:\n"
            output += "-" * 80 + "\n"
            if results["retrieved_context"]:
                for i, context in enumerate(results["retrieved_context"], 1):
                    output += f"Source {i}:\n{context}\n\n"
            else:
                output += "No reliable context sources retrieved based on the analysis.\n"
            output += "=" * 80
        return output


# Initialize the analyzer (adjust paths to match your Hugging Face Space structure)
analyzer = ZarmaLanguageAnalyzer("grammar_rules.json", "glossary.json")
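
# The analyzer can also be driven without the Gradio UI. A minimal sketch, assuming
# GOOGLE_API_KEY is set and the two JSON files above are present:
#
#     results = analyzer.analyze_sentence("Ay ga koy.", lang="en")
#     print(analyzer.format_output(results, lang="en"))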

# Gradio interface
def analyze_zarma_sentence(sentence, output_in_english):
    if not sentence.strip():
        return "Please enter a valid Zarma sentence." if output_in_english else "Veuillez entrer une phrase Zarma valide."
    lang = "en" if output_in_english else "fr"
    results = analyzer.analyze_sentence(sentence, lang=lang)
    return analyzer.format_output(results, lang=lang)

# Define the Gradio UI
with gr.Blocks(title="Zarma Language Analyzer") as demo:
    gr.Markdown("# Zarma Language Analyzer")
    gr.Markdown("Enter a Zarma sentence below to analyze its grammar and meaning.")
    sentence_input = gr.Textbox(label="Zarma Sentence", placeholder="e.g., Ay ga koy.")
    language_checkbox = gr.Checkbox(label="Output in English (uncheck for French)", value=True)
    analyze_button = gr.Button("Analyze")
    output_text = gr.Textbox(label="Analysis Result", lines=20)
    analyze_button.click(
        fn=analyze_zarma_sentence,
        inputs=[sentence_input, language_checkbox],
        outputs=output_text
    )

# Launch the app (share=True is only needed for a temporary public link when running
# locally; on Hugging Face Spaces the app is hosted directly)
demo.launch(share=True)