# Web-page header residue: author "Mamadou2727", commit message "fixe tables", commit ca4f534.
import json
import warnings
import re
import os
import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.prompts import PromptTemplate
import gradio as gr
# Suppress warnings for cleaner output.
# No category or module filter is passed, so every warning routed through
# the `warnings` module is ignored from this point on.
warnings.filterwarnings("ignore")
class ZarmaLanguageAnalyzer:
    """Analyze Zarma sentences with rule checks, a glossary, and Gemini.

    The analyzer combines three sources of information:

    * a list of grammar rules whose examples are matched as substrings,
    * a French/Djerma glossary, searched exactly and through a FAISS index,
    * a Gemini model that writes the final linguistic analysis.
    """

    # Characters stripped from both ends of a token before glossary lookup.
    _WORD_PUNCTUATION = ".,!?;:()\"'"

    # Text used when a glossary entry carries no "notes" field.
    _DEFAULT_NOTES = "No additional context available"

    # Matches the three numbered sections the prompt asks the model to emit.
    # The last section stops at the first blank line or at the end of text.
    _ANALYSIS_PATTERN = re.compile(
        r"(1\. WORD BREAKDOWN:\s*-\s*.+?)"
        r"(2\. LINGUISTIC INSIGHT:\s*-\s*Word Order:\s*.+?)"
        r"(3\. CORRECTNESS ASSESSMENT:\s*-\s*Is the sentence correct\?.+?)(?=\n\n|$)",
        re.DOTALL,
    )

    # Returned by extract_analysis when the model output has no usable sections.
    _FALLBACK_ANALYSIS = (
        "1. WORD BREAKDOWN:\n"
        " - Analysis incomplete due to model limitations.\n\n"
        "2. LINGUISTIC INSIGHT:\n"
        " - Word Order: Analysis incomplete.\n"
        " - Tense/Aspect Markers: Analysis incomplete.\n"
        " - Contextual Insight: Analysis incomplete.\n\n"
        "3. CORRECTNESS ASSESSMENT:\n"
        " - Is the sentence correct? Unknown due to model limitations.\n"
        " - Reason for Incorrectness (if applicable): Unknown.\n"
        " - Corrections: None provided."
    )

    # Per-language labels for format_output; any lang other than "fr" uses "en".
    _OUTPUT_LABELS = {
        "en": {
            "title": "ZARMA LANGUAGE ANALYZER",
            "sentence": "Sentence Analyzed",
            "status": "Grammar Status",
            "issues": "Issues detected",
            "ok": "Correct",
            "analysis": "Detailed Analysis:",
            "sources": "Reliable Context Sources:",
            "no_sources": "No reliable context sources retrieved based on the analysis.",
        },
        "fr": {
            "title": "ANALYSEUR DE LANGUE ZARMA",
            "sentence": "Phrase Analysée",
            "status": "État de la Grammaire",
            "issues": "Problèmes détectés",
            "ok": "Correct",
            "analysis": "Analyse Détaillée:",
            "sources": "Sources de Contexte Fiables:",
            "no_sources": "Aucune source de contexte fiable récupérée basée sur l'analyse.",
        },
    }

    def __init__(self, grammar_path: str, glossary_path: str):
        """
        Initialize the Zarma Language Analyzer with grammar rules and glossary.
        Optimized for CPU usage on Hugging Face Spaces.

        Args:
            grammar_path: JSON file holding a top-level "grammar_rules" list.
            glossary_path: JSON file holding the glossary entries.

        Raises:
            ValueError: if GOOGLE_API_KEY is not set (from _setup_models).
        """
        print("Running on CPU for Hugging Face Spaces.")
        self.grammar_rules = self._load_json(grammar_path).get("grammar_rules", [])
        self.glossary_data = self._load_json(glossary_path)
        self._setup_models()
        self._setup_vectorstore()

    def _load_json(self, file_path: str) -> dict:
        """Load and parse a UTF-8 encoded JSON file."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _setup_models(self):
        """Set up the Gemini-2.0-flash model via Google Generative AI API.

        Raises:
            ValueError: if the GOOGLE_API_KEY environment variable is unset
                or empty.
        """
        # Get API key from environment variable
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY environment variable not set.")
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")
        # The prompt text is kept flush-left on purpose: any indentation
        # inside the triple-quoted string would become part of the prompt.
        self.analysis_template = PromptTemplate(
            input_variables=["sentence", "grammar_check", "glossary_info", "language"],
            template="""
You are a Zarma language expert. Analyze this Zarma sentence: "{sentence}"
Rely primarily on your expertise in Zarma grammar and meaning. Recognize proper nouns (e.g., names or places) as such unless the glossary explicitly contradicts this with a common Zarma meaning. Use the grammar check and glossary below as supplementary aids only—do not override your knowledge unless they provide clear, contextually relevant insight.
Grammar check results (optional guide):
{grammar_check}
Glossary information (use it but prioritize your expertise to confirm):
{glossary_info}
Provide a detailed linguistic analysis in {language} in this exact format, with no extra text outside the sections:
1. WORD BREAKDOWN:
- [List each word with its grammatical role and meaning, e.g., "Ay: 1st person singular pronoun, meaning 'I'."]
2. LINGUISTIC INSIGHT:
- Word Order: [Describe typical Zarma word order (e.g., SOV, SVO) and how this sentence aligns or deviates]
- Tense/Aspect Markers: [Explain tense/aspect markers like 'ga', 'goono ga', or none for past, with examples like "Ay ga koy" (I will go)]
- Contextual Insight: [Discuss what the sentence might intend to convey and any external influences or errors]
3. CORRECTNESS ASSESSMENT:
- Is the sentence correct? [Yes/No, with explanation]
- Reason for Incorrectness (if applicable): [Detailed reason why it’s wrong, e.g., misplaced particle]
- Corrections (depending on intended meaning):
- [Option 1: Corrected sentence with explanation]
- [Option 2: Corrected sentence with explanation]
- [Option 3: Corrected sentence with explanation]
"""
        )

    def _setup_vectorstore(self):
        """Set up FAISS vector store with the glossary for retrieval."""
        embed_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"}  # Force CPU usage
        )
        documents = []
        for entry in self.glossary_data:
            fr_word = entry.get("fr", "")
            dje_word = entry.get("dje", "")
            notes = entry.get("notes", self._DEFAULT_NOTES)
            content = f"French: {fr_word}\nDjerma: {dje_word}\nNotes: {notes}"
            metadata = {"fr": fr_word, "dje": dje_word, "notes": notes}
            documents.append(Document(page_content=content, metadata=metadata))
        self.vectorstore = FAISS.from_documents(documents, embed_model)

    def check_grammar(self, sentence: str) -> list:
        """Check if the sentence violates any grammar rules.

        A rule fires when the "zarma" text of one of its examples occurs as a
        (case-sensitive) substring of ``sentence``.

        Returns:
            One dict per matching example, with the keys rule_id, category,
            subcategory, description, wrong_phrase, corrected_phrase,
            english_example and explanation. Empty when nothing matches.
        """
        issues = []
        for rule in self.grammar_rules:
            category = rule.get("category", "")
            subcategory = rule.get("subcategory", "")
            for example in rule.get("examples", []):
                wrong_phrase = example.get("zarma", "")
                if not wrong_phrase or wrong_phrase not in sentence:
                    continue
                corrected_phrase = example.get("corrected_zarma", "")
                english_example = example.get("english", "")
                explanation = (
                    f"This rule applies because '{wrong_phrase}' doesn't follow {category} norms in Zarma. "
                    f"Specifically, it violates rules related to {subcategory}. "
                    f"The correct form would be '{corrected_phrase or 'unknown'}'. "
                    f"In English, this is similar to: '{english_example}'"
                )
                issues.append({
                    "rule_id": rule.get("rule_id", ""),
                    "category": category,
                    "subcategory": subcategory,
                    "description": rule.get("description", ""),
                    "wrong_phrase": wrong_phrase,
                    "corrected_phrase": corrected_phrase,
                    "english_example": english_example,
                    "explanation": explanation,
                })
        return issues

    def _build_glossary_index(self) -> dict:
        """Map each lowercased Djerma/French headword to its first glossary entry."""
        index = {}
        for entry in self.glossary_data:
            for field in ("dje", "fr"):
                # `or ""` tolerates JSON null values as well as missing keys.
                key = (entry.get(field) or "").lower()
                if key:
                    # setdefault keeps the earliest entry, matching a
                    # first-match linear scan over the glossary.
                    index.setdefault(key, entry)
        return index

    def translate_and_explain_words(self, sentence: str) -> dict:
        """Break the sentence into words and find glossary entries.

        Each whitespace-separated token is stripped of surrounding
        punctuation and looked up case-insensitively against the Djerma and
        French headwords. Tokens without an exact match fall back to the
        nearest vector-store neighbour. The whole sentence is also searched
        (k=5) for extra context.

        Returns:
            A dict with "word_info" (token -> french/djerma/notes/match_type)
            and "retrieved_context" (at most three context strings).
        """
        exact_index = self._build_glossary_index()
        word_info = {}
        retrieved_context = []

        for word in sentence.split():
            clean_word = word.strip(self._WORD_PUNCTUATION)
            # A repeated token would yield the same result; skip the lookup.
            if not clean_word or clean_word in word_info:
                continue

            source = exact_index.get(clean_word.lower())
            match_type = "exact"
            if source is None:
                search_results = self.vectorstore.similarity_search(clean_word, k=1)
                if not search_results:
                    continue
                source = search_results[0].metadata
                match_type = "semantic"

            fr_word = source.get("fr", "")
            dje_word = source.get("dje", "")
            notes = source.get("notes", self._DEFAULT_NOTES)
            word_info[clean_word] = {
                "french": fr_word,
                "djerma": dje_word,
                "notes": notes,
                "match_type": match_type,
            }
            context_entry = f"Word: {clean_word}\nFrench: {fr_word}\nDjerma: {dje_word}\nNotes: {notes}"
            if context_entry not in retrieved_context:
                retrieved_context.append(context_entry)

        for result in self.vectorstore.similarity_search(sentence, k=5):
            if result.page_content not in retrieved_context:
                retrieved_context.append(result.page_content)

        return {"word_info": word_info, "retrieved_context": retrieved_context[:3]}

    def format_grammar_issues(self, issues: list) -> str:
        """Format grammar issues for display."""
        if not issues:
            return "No grammar issues detected."
        blocks = ["Grammar Issues Detected:\n\n"]
        for i, issue in enumerate(issues, 1):
            blocks.append(
                f"Issue {i}:\n"
                f"Rule ID: {issue.get('rule_id', '')}\n"
                f"Category: {issue.get('category', '')}\n"
                f"Subcategory: {issue.get('subcategory', '')}\n"
                f"Description: {issue.get('description', '')}\n"
                f"Wrong phrase: '{issue.get('wrong_phrase', '')}'\n"
                f"Corrected phrase: '{issue.get('corrected_phrase', '')}'\n"
                f"English example: {issue.get('english_example', '')}\n"
                f"Explanation: {issue.get('explanation', '')}\n\n"
            )
        return "".join(blocks)

    def format_glossary_info(self, glossary_results: dict) -> str:
        """Format glossary information for model input."""
        word_info = glossary_results.get("word_info", {})
        if not word_info:
            return "No glossary matches found for words in the sentence."
        blocks = ["Glossary information:\n\n"]
        for word, info in word_info.items():
            blocks.append(
                f"Word: {word}\n"
                f"French: {info.get('french', '')}\n"
                f"Djerma: {info.get('djerma', '')}\n"
                f"Notes: {info.get('notes', '')}\n\n"
            )
        return "".join(blocks)

    def filter_reliable_context(self, glossary_results: dict, analysis_result: str) -> list:
        """Filter glossary context to only show entries reliable in the context of Gemini's analysis.

        An entry is kept when the value on its first line (the text after
        "Word: " or "French: ") occurs, case-insensitively, in the analysis.
        Entries whose first line has no "key: value" shape or an empty value
        are skipped: an empty value would otherwise match every analysis.

        Returns:
            At most three context strings, in their original order.
        """
        analysis_lower = analysis_result.lower()
        reliable_context = []
        for context in glossary_results.get("retrieved_context", []):
            first_line = context.split("\n", 1)[0]
            _, separator, value = first_line.partition(": ")
            keyword = value.strip().lower()
            if not separator or not keyword:
                continue
            if keyword in analysis_lower:
                reliable_context.append(context)
        return reliable_context[:3]

    def extract_analysis(self, raw_output: str) -> str:
        """Extract the detailed analysis sections.

        Returns the three numbered sections joined by newlines when the model
        output follows the requested format, otherwise a fixed
        "analysis incomplete" placeholder.
        """
        match = self._ANALYSIS_PATTERN.search(raw_output)
        if match:
            return "\n".join(match.groups())
        return self._FALLBACK_ANALYSIS

    def analyze_sentence(self, sentence: str, lang: str = "en") -> dict:
        """Full analysis pipeline for a Zarma sentence using Gemini-2.0-flash.

        Args:
            sentence: the Zarma sentence to analyze.
            lang: "en" for an English analysis; any other value selects French.

        Returns:
            A dict with the keys sentence, grammar_issues, formatted_grammar,
            analysis_result and retrieved_context.
        """
        grammar_issues = self.check_grammar(sentence)
        formatted_grammar = self.format_grammar_issues(grammar_issues)
        glossary_results = self.translate_and_explain_words(sentence)
        prompt = self.analysis_template.format(
            sentence=sentence,
            grammar_check=formatted_grammar,
            glossary_info=self.format_glossary_info(glossary_results),
            language="English" if lang == "en" else "French",
        )
        try:
            raw_analysis = self.model.generate_content(prompt).text
        except Exception as exc:
            # Best-effort boundary: the UI still gets the placeholder
            # analysis, but the cause is logged instead of being dropped.
            print(f"Error in analysis generation: {exc}")
            raw_analysis = ""
        analysis_result = self.extract_analysis(raw_analysis)
        return {
            "sentence": sentence,
            "grammar_issues": grammar_issues,
            "formatted_grammar": formatted_grammar,
            "analysis_result": analysis_result,
            "retrieved_context": self.filter_reliable_context(glossary_results, analysis_result),
        }

    def format_output(self, results: dict, lang: str = "en") -> str:
        """Format the analysis results for Gradio output in the selected language.

        Args:
            results: dict as returned by analyze_sentence.
            lang: "fr" for French labels; any other value defaults to English.
        """
        labels = self._OUTPUT_LABELS["fr" if lang == "fr" else "en"]
        heavy_rule = "=" * 80
        light_rule = "-" * 80
        status = labels["issues"] if results["grammar_issues"] else labels["ok"]

        parts = [
            f"{heavy_rule}\n{labels['title']}\n{heavy_rule}\n\n",
            f'{labels["sentence"]}: "{results["sentence"]}"\n',
            f"{labels['status']}: {status}\n\n",
            f"{labels['analysis']}\n{light_rule}\n",
            f"{results['analysis_result']}\n\n",
            f"{labels['sources']}\n{light_rule}\n",
        ]
        contexts = results["retrieved_context"]
        if contexts:
            parts.extend(
                f"Source {i}:\n{context}\n\n" for i, context in enumerate(contexts, 1)
            )
        else:
            parts.append(labels["no_sources"] + "\n")
        parts.append(heavy_rule)
        return "".join(parts)
# Build the shared analyzer once at import time; both JSON paths are relative
# to the working directory (adjust them to match your Hugging Face Space layout).
analyzer = ZarmaLanguageAnalyzer(
    grammar_path="grammar_rules.json",
    glossary_path="glossary.json",
)
# Gradio interface
def analyze_zarma_sentence(sentence, output_in_english):
    """Gradio callback: analyze one Zarma sentence and return the report text.

    Args:
        sentence: text from the input box; None, empty, or whitespace-only
            input is rejected with a prompt message instead of being analyzed.
        output_in_english: truthy for English output, falsy for French.

    Returns:
        The formatted analysis, or a short "please enter a sentence" message
        in the selected language.
    """
    lang = "en" if output_in_english else "fr"
    # Guard against None as well as blank text so .strip() cannot raise.
    if not sentence or not sentence.strip():
        if lang == "en":
            return "Please enter a valid Zarma sentence."
        return "Veuillez entrer une phrase Zarma valide."
    results = analyzer.analyze_sentence(sentence, lang=lang)
    return analyzer.format_output(results, lang=lang)
# Define the Gradio UI
with gr.Blocks(title="Zarma Language Analyzer") as demo:
    # Page heading and a one-line usage hint.
    gr.Markdown("# Zarma Language Analyzer")
    gr.Markdown("Enter a Zarma sentence below to analyze its grammar and meaning.")

    # Inputs: the sentence and the output-language toggle (checked = English).
    sentence_input = gr.Textbox(
        label="Zarma Sentence",
        placeholder="e.g., Ay ga koy.",
    )
    language_checkbox = gr.Checkbox(
        label="Output in English (uncheck for French)",
        value=True,
    )
    analyze_button = gr.Button("Analyze")

    # Output: the formatted report produced by analyze_zarma_sentence.
    output_text = gr.Textbox(label="Analysis Result", lines=20)

    analyze_button.click(
        analyze_zarma_sentence,
        inputs=[sentence_input, language_checkbox],
        outputs=output_text,
    )

# Launch the app
demo.launch(share=True)