# Source: Hugging Face Space 27Group/Zarma_Language_Analyzer
# (Space status when captured: Sleeping; file size: 16,813 Bytes)

import json
import warnings
import re
import os
import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.prompts import PromptTemplate
import gradio as gr

# Silence every warning category process-wide so the console output stays clean.
# NOTE(review): this also hides deprecation warnings from the imported libraries.
warnings.filterwarnings("ignore")

class ZarmaLanguageAnalyzer:
    def __init__(self, grammar_path: str, glossary_path: str):
        """
        Initialize the Zarma Language Analyzer with grammar rules and glossary.
        Optimized for CPU usage on Hugging Face Spaces.
        """
        print("Running on CPU for Hugging Face Spaces.")
        
        self.grammar_rules = self._load_json(grammar_path).get("grammar_rules", [])
        self.glossary_data = self._load_json(glossary_path)
        
        self._setup_models()
        self._setup_vectorstore()
        
    def _load_json(self, file_path: str) -> dict:
        """Load and parse a JSON file."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    
    def _setup_models(self):
        """Set up the Gemini-2.0-flash model via Google Generative AI API."""
        # Get API key from environment variable
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY environment variable not set.")
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")
        
        self.analysis_template = PromptTemplate(
            input_variables=["sentence", "grammar_check", "glossary_info", "language"],
            template="""
            You are a Zarma language expert. Analyze this Zarma sentence: "{sentence}"
            Rely primarily on your expertise in Zarma grammar and meaning. Recognize proper nouns (e.g., names or places) as such unless the glossary explicitly contradicts this with a common Zarma meaning. Use the grammar check and glossary below as supplementary aids only—do not override your knowledge unless they provide clear, contextually relevant insight.
            Grammar check results (optional guide):
            {grammar_check}
            Glossary information (use it but prioritize your expertise to confirm):
            {glossary_info}
            Provide a detailed linguistic analysis in {language} in this exact format, with no extra text outside the sections:
            1. WORD BREAKDOWN:
               - [List each word with its grammatical role and meaning, e.g., "Ay: 1st person singular pronoun, meaning 'I'."]
            2. LINGUISTIC INSIGHT:
               - Word Order: [Describe typical Zarma word order (e.g., SOV, SVO) and how this sentence aligns or deviates]
               - Tense/Aspect Markers: [Explain tense/aspect markers like 'ga', 'goono ga', or none for past, with examples like "Ay ga koy" (I will go)]
               - Contextual Insight: [Discuss what the sentence might intend to convey and any external influences or errors]
            3. CORRECTNESS ASSESSMENT:
               - Is the sentence correct? [Yes/No, with explanation]
               - Reason for Incorrectness (if applicable): [Detailed reason why it’s wrong, e.g., misplaced particle]
               - Corrections (depending on intended meaning):
                  - [Option 1: Corrected sentence with explanation]
                  - [Option 2: Corrected sentence with explanation]
                  - [Option 3: Corrected sentence with explanation]
            """
        )
    
    def _setup_vectorstore(self):
        """
        Index every glossary entry in a FAISS store kept on ``self.vectorstore``.

        Each entry becomes one document whose text lists its "fr", "dje" and
        "notes" values; the same three values are kept as document metadata.
        """
        missing_notes = "No additional context available"

        # The embedding model is pinned to the CPU device.
        embedder = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
        )

        def as_document(entry):
            # Absent keys fall back to "" (words) or a stock sentence (notes).
            french = entry.get("fr", "")
            djerma = entry.get("dje", "")
            remarks = entry.get("notes", missing_notes)
            return Document(
                page_content=f"French: {french}\nDjerma: {djerma}\nNotes: {remarks}",
                metadata={"fr": french, "dje": djerma, "notes": remarks},
            )

        glossary_docs = [as_document(entry) for entry in self.glossary_data]
        self.vectorstore = FAISS.from_documents(glossary_docs, embedder)
    
    
    def check_grammar(self, sentence: str) -> list:
        """Check if the sentence violates any grammar rules."""
        issues = []
        for rule in self.grammar_rules:
            rule_id = rule.get("rule_id", "")
            category = rule.get("category", "")
            subcategory = rule.get("subcategory", "")
            description = rule.get("description", "")
            examples = rule.get("examples", [])
            
            for example in examples:
                wrong_phrase = example.get("zarma", "")
                corrected_phrase = example.get("corrected_zarma", "")
                english_example = example.get("english", "")
                
                if wrong_phrase and wrong_phrase in sentence:
                    explanation = (
                        f"This rule applies because '{wrong_phrase}' doesn't follow {category} norms in Zarma. "
                        f"Specifically, it violates rules related to {subcategory}. "
                        f"The correct form would be '{corrected_phrase or 'unknown'}'. "
                        f"In English, this is similar to: '{english_example}'"
                    )
                    issues.append({
                        "rule_id": rule_id,
                        "category": category,
                        "subcategory": subcategory,
                        "description": description,
                        "wrong_phrase": wrong_phrase,
                        "corrected_phrase": corrected_phrase,
                        "english_example": english_example,
                        "explanation": explanation
                    })
        return issues
    
    def translate_and_explain_words(self, sentence: str) -> dict:
        """Break the sentence into words and find glossary entries."""
        words = sentence.split()
        word_info = {}
        retrieved_context = []
        
        for word in words:
            clean_word = word.strip(".,!?;:()\"'")
            if not clean_word:
                continue
            
            exact_match = None
            for entry in self.glossary_data:
                if entry.get("dje", "").lower() == clean_word.lower() or entry.get("fr", "").lower() == clean_word.lower():
                    exact_match = entry
                    break
            
            if exact_match:
                fr_word = exact_match.get("fr", "")
                dje_word = exact_match.get("dje", "")
                notes = exact_match.get("notes", "No additional context available")
                
                word_info[clean_word] = {
                    "french": fr_word,
                    "djerma": dje_word,
                    "notes": notes,
                    "match_type": "exact"
                }
                
                context_entry = f"Word: {clean_word}\nFrench: {fr_word}\nDjerma: {dje_word}\nNotes: {notes}"
                if context_entry not in retrieved_context:
                    retrieved_context.append(context_entry)
            else:
                search_results = self.vectorstore.similarity_search(clean_word, k=1)
                if search_results:
                    result = search_results[0]
                    metadata = result.metadata
                    word_info[clean_word] = {
                        "french": metadata.get("fr", ""),
                        "djerma": metadata.get("dje", ""),
                        "notes": metadata.get("notes", "No additional context available"),
                        "match_type": "semantic"
                    }
                    context_entry = f"Word: {clean_word}\nFrench: {metadata.get('fr', '')}\nDjerma: {metadata.get('dje', '')}\nNotes: {metadata.get('notes', 'No additional context available')}"
                    if context_entry not in retrieved_context:
                        retrieved_context.append(context_entry)
        
        sentence_results = self.vectorstore.similarity_search(sentence, k=5)
        for result in sentence_results:
            context_entry = result.page_content
            if context_entry not in retrieved_context:
                retrieved_context.append(context_entry)
        
        top_contexts = retrieved_context[:3]
        return {"word_info": word_info, "retrieved_context": top_contexts}
    
    def format_grammar_issues(self, issues: list) -> str:
        """Format grammar issues for display."""
        if not issues:
            return "No grammar issues detected."
        result = "Grammar Issues Detected:\n\n"
        for i, issue in enumerate(issues, 1):
            result += f"Issue {i}:\n"
            result += f"Rule ID: {issue.get('rule_id', '')}\n"
            result += f"Category: {issue.get('category', '')}\n"
            result += f"Subcategory: {issue.get('subcategory', '')}\n"
            result += f"Description: {issue.get('description', '')}\n"
            result += f"Wrong phrase: '{issue.get('wrong_phrase', '')}'\n"
            result += f"Corrected phrase: '{issue.get('corrected_phrase', '')}'\n"
            result += f"English example: {issue.get('english_example', '')}\n"
            result += f"Explanation: {issue.get('explanation', '')}\n\n"
        return result
    
    def format_glossary_info(self, glossary_results: dict) -> str:
        """Format glossary information for model input."""
        word_info = glossary_results.get("word_info", {})
        if not word_info:
            return "No glossary matches found for words in the sentence."
        result = "Glossary information:\n\n"
        for word, info in word_info.items():
            result += f"Word: {word}\n"
            result += f"French: {info.get('french', '')}\n"
            result += f"Djerma: {info.get('djerma', '')}\n"
            result += f"Notes: {info.get('notes', '')}\n\n"
        return result
    
    def filter_reliable_context(self, glossary_results: dict, analysis_result: str) -> list:
        """Filter glossary context to only show entries reliable in the context of Gemini's analysis."""
        retrieved_context = glossary_results.get("retrieved_context", [])
        analysis_lower = analysis_result.lower()
        reliable_context = []
        
        for context in retrieved_context:
            lines = context.split("\n")
            word_line = lines[0]
            word = word_line.split(": ")[1].lower()
            
            if word in analysis_lower:
                reliable_context.append(context)
        
        return reliable_context[:3]
    
    def extract_analysis(self, raw_output: str) -> str:
        """Extract the detailed analysis sections."""
        pattern = (
            r"(1\. WORD BREAKDOWN:\s*-\s*.+?)" +
            r"(2\. LINGUISTIC INSIGHT:\s*-\s*Word Order:\s*.+?)" +
            r"(3\. CORRECTNESS ASSESSMENT:\s*-\s*Is the sentence correct\?.+?)(?=\n\n|$)"
        )
        match = re.search(pattern, raw_output, re.DOTALL)
        
        if match:
            return match.group(1) + "\n" + match.group(2) + "\n" + match.group(3)
        
        return (
            "1. WORD BREAKDOWN:\n"
            "   - Analysis incomplete due to model limitations.\n\n"
            "2. LINGUISTIC INSIGHT:\n"
            "   - Word Order: Analysis incomplete.\n"
            "   - Tense/Aspect Markers: Analysis incomplete.\n"
            "   - Contextual Insight: Analysis incomplete.\n\n"
            "3. CORRECTNESS ASSESSMENT:\n"
            "   - Is the sentence correct? Unknown due to model limitations.\n"
            "   - Reason for Incorrectness (if applicable): Unknown.\n"
            "   - Corrections: None provided."
        )
    
    def analyze_sentence(self, sentence: str, lang: str = "en") -> dict:
        """Full analysis pipeline for a Zarma sentence using Gemini-2.0-flash."""
        grammar_issues = self.check_grammar(sentence)
        formatted_grammar = self.format_grammar_issues(grammar_issues)
        glossary_results = self.translate_and_explain_words(sentence)
        formatted_glossary = self.format_glossary_info(glossary_results)
        
        language = "English" if lang == "en" else "French"
        prompt = self.analysis_template.format(
            sentence=sentence,
            grammar_check=formatted_grammar,
            glossary_info=formatted_glossary,
            language=language
        )
        
        raw_analysis = ""
        try:
            response = self.model.generate_content(prompt)
            raw_analysis = response.text
        except Exception as e:
            raw_analysis = f"Error in analysis generation: {str(e)}"
        
        analysis_result = self.extract_analysis(raw_analysis)
        reliable_context = self.filter_reliable_context(glossary_results, analysis_result)
        
        return {
            "sentence": sentence,
            "grammar_issues": grammar_issues,
            "formatted_grammar": formatted_grammar,
            "analysis_result": analysis_result,
            "retrieved_context": reliable_context
        }
    
    def format_output(self, results: dict, lang: str = "en") -> str:
        """Format the analysis results for Gradio output in the selected language."""
        if lang == "fr":
            output = "=" * 80 + "\n"
            output += "ANALYSEUR DE LANGUE ZARMA\n"
            output += "=" * 80 + "\n\n"
            
            output += f"Phrase Analysée: \"{results['sentence']}\"\n"
            output += f"État de la Grammaire: {'Problèmes détectés' if results['grammar_issues'] else 'Correct'}\n\n"
            
            output += "Analyse Détaillée:\n"
            output += "-" * 80 + "\n"
            output += results['analysis_result'] + "\n\n"
            
            output += "Sources de Contexte Fiables:\n"
            output += "-" * 80 + "\n"
            if results["retrieved_context"]:
                for i, context in enumerate(results["retrieved_context"], 1):
                    output += f"Source {i}:\n{context}\n\n"
            else:
                output += "Aucune source de contexte fiable récupérée basée sur l'analyse.\n"
            output += "=" * 80
        else:  # Default to English
            output = "=" * 80 + "\n"
            output += "ZARMA LANGUAGE ANALYZER\n"
            output += "=" * 80 + "\n\n"
            
            output += f"Sentence Analyzed: \"{results['sentence']}\"\n"
            output += f"Grammar Status: {'Issues detected' if results['grammar_issues'] else 'Correct'}\n\n"
            
            output += "Detailed Analysis:\n"
            output += "-" * 80 + "\n"
            output += results['analysis_result'] + "\n\n"
            
            output += "Reliable Context Sources:\n"
            output += "-" * 80 + "\n"
            if results["retrieved_context"]:
                for i, context in enumerate(results["retrieved_context"], 1):
                    output += f"Source {i}:\n{context}\n\n"
            else:
                output += "No reliable context sources retrieved based on the analysis.\n"
            output += "=" * 80
        
        return output

# Build the shared analyzer once, at import time. Both file names are relative,
# so they are resolved against the process's working directory (adjust them to
# match your Hugging Face Space structure). Construction raises ValueError when
# the GOOGLE_API_KEY environment variable is not set.
analyzer = ZarmaLanguageAnalyzer("grammar_rules.json", "glossary.json")

# Gradio interface
def analyze_zarma_sentence(sentence, output_in_english):
    """
    Gradio callback: analyze ``sentence`` and return the formatted report.

    A blank sentence short-circuits to a prompt asking for input. The boolean
    ``output_in_english`` selects English (true) or French (false) for both
    the prompt and the report.
    """
    lang = "en" if output_in_english else "fr"
    if sentence.strip():
        results = analyzer.analyze_sentence(sentence, lang=lang)
        return analyzer.format_output(results, lang=lang)
    if output_in_english:
        return "Please enter a valid Zarma sentence."
    return "Veuillez entrer une phrase Zarma valide."

# Define the Gradio UI: a heading, a sentence box, a language checkbox, a
# button and a result box. Components are declared in display order inside the
# Blocks context.
with gr.Blocks(title="Zarma Language Analyzer") as demo:
    gr.Markdown("# Zarma Language Analyzer")
    gr.Markdown("Enter a Zarma sentence below to analyze its grammar and meaning.")
    
    sentence_input = gr.Textbox(label="Zarma Sentence", placeholder="e.g., Ay ga koy.")
    # Checked (the default) means English output; unchecked means French.
    language_checkbox = gr.Checkbox(label="Output in English (uncheck for French)", value=True)
    analyze_button = gr.Button("Analyze")
    output_text = gr.Textbox(label="Analysis Result", lines=20)
    
    # Clicking the button passes (sentence, checkbox state) to the callback and
    # writes its return value into the result box.
    analyze_button.click(
        fn=analyze_zarma_sentence,
        inputs=[sentence_input, language_checkbox],
        outputs=output_text
    )

# Launch the app. This runs at import time (there is no __main__ guard).
# NOTE(review): share=True asks Gradio for a public share link; this is
# presumably unnecessary when the app is hosted on a Space — confirm.
demo.launch(share=True)