import json
import warnings
import re
import os

import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.prompts import PromptTemplate
import gradio as gr

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")


class ZarmaLanguageAnalyzer:
    def __init__(self, grammar_path: str, glossary_path: str):
        """
        Initialize the Zarma Language Analyzer with grammar rules and glossary.
        Optimized for CPU usage on Hugging Face Spaces.
        """
        print("Running on CPU for Hugging Face Spaces.")
        self.grammar_rules = self._load_json(grammar_path).get("grammar_rules", [])
        self.glossary_data = self._load_json(glossary_path)
        self._setup_models()
        self._setup_vectorstore()

    def _load_json(self, file_path: str) -> dict:
        """Load and parse a JSON file."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
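
    # A minimal, hypothetical sketch of the two input files, inferred from the key
    # lookups used throughout this class; the real grammar_rules.json and
    # glossary.json may carry additional fields, and the "aller"/"koy" entry is
    # illustrative only.
    #
    # grammar_rules.json:
    #     {"grammar_rules": [
    #         {"rule_id": "...", "category": "...", "subcategory": "...",
    #          "description": "...",
    #          "examples": [{"zarma": "...", "corrected_zarma": "...", "english": "..."}]}
    #     ]}
    #
    # glossary.json:
    #     [{"fr": "aller", "dje": "koy", "notes": "..."}]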

    def _setup_models(self):
        """Set up the Gemini-2.0-flash model via the Google Generative AI API."""
        # Get the API key from an environment variable
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY environment variable not set.")
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")
        self.analysis_template = PromptTemplate(
            input_variables=["sentence", "grammar_check", "glossary_info", "language"],
            template="""
You are a Zarma language expert. Analyze this Zarma sentence: "{sentence}"
Rely primarily on your expertise in Zarma grammar and meaning. Recognize proper nouns (e.g., names or places) as such unless the glossary explicitly contradicts this with a common Zarma meaning. Use the grammar check and glossary below as supplementary aids only—do not override your knowledge unless they provide clear, contextually relevant insight.
Grammar check results (optional guide):
{grammar_check}
Glossary information (use it but prioritize your expertise to confirm):
{glossary_info}
Provide a detailed linguistic analysis in {language} in this exact format, with no extra text outside the sections:
1. WORD BREAKDOWN:
- [List each word with its grammatical role and meaning, e.g., "Ay: 1st person singular pronoun, meaning 'I'."]
2. LINGUISTIC INSIGHT:
- Word Order: [Describe typical Zarma word order (e.g., SOV, SVO) and how this sentence aligns or deviates]
- Tense/Aspect Markers: [Explain tense/aspect markers like 'ga', 'goono ga', or none for past, with examples like "Ay ga koy" (I will go)]
- Contextual Insight: [Discuss what the sentence might intend to convey and any external influences or errors]
3. CORRECTNESS ASSESSMENT:
- Is the sentence correct? [Yes/No, with explanation]
- Reason for Incorrectness (if applicable): [Detailed reason why it’s wrong, e.g., misplaced particle]
- Corrections (depending on intended meaning):
- [Option 1: Corrected sentence with explanation]
- [Option 2: Corrected sentence with explanation]
- [Option 3: Corrected sentence with explanation]
"""
        )

    def _setup_vectorstore(self):
        """Set up FAISS vector store with the glossary for retrieval."""
        embed_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"}  # Force CPU usage
        )
        documents = []
        for entry in self.glossary_data:
            fr_word = entry.get("fr", "")
            dje_word = entry.get("dje", "")
            notes = entry.get("notes", "No additional context available")
            content = f"French: {fr_word}\nDjerma: {dje_word}\nNotes: {notes}"
            metadata = {"fr": fr_word, "dje": dje_word, "notes": notes}
            documents.append(Document(page_content=content, metadata=metadata))
        self.vectorstore = FAISS.from_documents(documents, embed_model)
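
    # Sketch of how the index built above gets queried in translate_and_explain_words
    # below: FAISS.similarity_search returns LangChain Document objects whose metadata
    # mirrors the fr/dje/notes fields stored per glossary entry. The query word "koy"
    # is only an illustrative example.
    #
    #     hits = self.vectorstore.similarity_search("koy", k=1)
    #     if hits:
    #         closest = hits[0].metadata  # e.g. {"fr": "...", "dje": "...", "notes": "..."}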

    def check_grammar(self, sentence: str) -> list:
        """Check if the sentence violates any grammar rules."""
        issues = []
        for rule in self.grammar_rules:
            rule_id = rule.get("rule_id", "")
            category = rule.get("category", "")
            subcategory = rule.get("subcategory", "")
            description = rule.get("description", "")
            examples = rule.get("examples", [])
            for example in examples:
                wrong_phrase = example.get("zarma", "")
                corrected_phrase = example.get("corrected_zarma", "")
                english_example = example.get("english", "")
                if wrong_phrase and wrong_phrase in sentence:
                    explanation = (
                        f"This rule applies because '{wrong_phrase}' doesn't follow {category} norms in Zarma. "
                        f"Specifically, it violates rules related to {subcategory}. "
                        f"The correct form would be '{corrected_phrase or 'unknown'}'. "
                        f"In English, this is similar to: '{english_example}'"
                    )
                    issues.append({
                        "rule_id": rule_id,
                        "category": category,
                        "subcategory": subcategory,
                        "description": description,
                        "wrong_phrase": wrong_phrase,
                        "corrected_phrase": corrected_phrase,
                        "english_example": english_example,
                        "explanation": explanation
                    })
        return issues
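
    # Note: the check above is a plain substring match against each rule's example
    # phrase, so it only flags wordings that appear verbatim in grammar_rules.json;
    # it serves as a lightweight pre-check, and the final judgement is left to the
    # Gemini prompt assembled in analyze_sentence below.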

    def translate_and_explain_words(self, sentence: str) -> dict:
        """Break the sentence into words and find glossary entries."""
        words = sentence.split()
        word_info = {}
        retrieved_context = []
        for word in words:
            clean_word = word.strip(".,!?;:()\"'")
            if not clean_word:
                continue
            exact_match = None
            for entry in self.glossary_data:
                if entry.get("dje", "").lower() == clean_word.lower() or entry.get("fr", "").lower() == clean_word.lower():
                    exact_match = entry
                    break
            if exact_match:
                fr_word = exact_match.get("fr", "")
                dje_word = exact_match.get("dje", "")
                notes = exact_match.get("notes", "No additional context available")
                word_info[clean_word] = {
                    "french": fr_word,
                    "djerma": dje_word,
                    "notes": notes,
                    "match_type": "exact"
                }
                context_entry = f"Word: {clean_word}\nFrench: {fr_word}\nDjerma: {dje_word}\nNotes: {notes}"
                if context_entry not in retrieved_context:
                    retrieved_context.append(context_entry)
            else:
                search_results = self.vectorstore.similarity_search(clean_word, k=1)
                if search_results:
                    result = search_results[0]
                    metadata = result.metadata
                    word_info[clean_word] = {
                        "french": metadata.get("fr", ""),
                        "djerma": metadata.get("dje", ""),
                        "notes": metadata.get("notes", "No additional context available"),
                        "match_type": "semantic"
                    }
                    context_entry = f"Word: {clean_word}\nFrench: {metadata.get('fr', '')}\nDjerma: {metadata.get('dje', '')}\nNotes: {metadata.get('notes', 'No additional context available')}"
                    if context_entry not in retrieved_context:
                        retrieved_context.append(context_entry)
        sentence_results = self.vectorstore.similarity_search(sentence, k=5)
        for result in sentence_results:
            context_entry = result.page_content
            if context_entry not in retrieved_context:
                retrieved_context.append(context_entry)
        top_contexts = retrieved_context[:3]
        return {"word_info": word_info, "retrieved_context": top_contexts}
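
    # For reference, the dictionary returned above has roughly this shape (the
    # values shown are illustrative, not taken from the real glossary):
    #
    #     {
    #       "word_info": {
    #         "koy": {"french": "aller", "djerma": "koy", "notes": "...", "match_type": "exact"}
    #       },
    #       "retrieved_context": ["Word: koy\nFrench: aller\nDjerma: koy\nNotes: ...", ...]
    #     }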

    def format_grammar_issues(self, issues: list) -> str:
        """Format grammar issues for display."""
        if not issues:
            return "No grammar issues detected."
        result = "Grammar Issues Detected:\n\n"
        for i, issue in enumerate(issues, 1):
            result += f"Issue {i}:\n"
            result += f"Rule ID: {issue.get('rule_id', '')}\n"
            result += f"Category: {issue.get('category', '')}\n"
            result += f"Subcategory: {issue.get('subcategory', '')}\n"
            result += f"Description: {issue.get('description', '')}\n"
            result += f"Wrong phrase: '{issue.get('wrong_phrase', '')}'\n"
            result += f"Corrected phrase: '{issue.get('corrected_phrase', '')}'\n"
            result += f"English example: {issue.get('english_example', '')}\n"
            result += f"Explanation: {issue.get('explanation', '')}\n\n"
        return result

    def format_glossary_info(self, glossary_results: dict) -> str:
        """Format glossary information for model input."""
        word_info = glossary_results.get("word_info", {})
        if not word_info:
            return "No glossary matches found for words in the sentence."
        result = "Glossary information:\n\n"
        for word, info in word_info.items():
            result += f"Word: {word}\n"
            result += f"French: {info.get('french', '')}\n"
            result += f"Djerma: {info.get('djerma', '')}\n"
            result += f"Notes: {info.get('notes', '')}\n\n"
        return result

    def filter_reliable_context(self, glossary_results: dict, analysis_result: str) -> list:
        """Keep only the glossary context entries whose headword actually appears in Gemini's analysis."""
        retrieved_context = glossary_results.get("retrieved_context", [])
        analysis_lower = analysis_result.lower()
        reliable_context = []
        for context in retrieved_context:
            # The first line of each entry is either "Word: ..." or "French: ...";
            # take the value after the label and check it against the analysis text.
            lines = context.split("\n")
            word_line = lines[0]
            word = word_line.split(": ")[1].lower()
            if word in analysis_lower:
                reliable_context.append(context)
        return reliable_context[:3]

    def extract_analysis(self, raw_output: str) -> str:
        """Extract the detailed analysis sections."""
        pattern = (
            r"(1\. WORD BREAKDOWN:\s*-\s*.+?)" +
            r"(2\. LINGUISTIC INSIGHT:\s*-\s*Word Order:\s*.+?)" +
            r"(3\. CORRECTNESS ASSESSMENT:\s*-\s*Is the sentence correct\?.+?)(?=\n\n|$)"
        )
        match = re.search(pattern, raw_output, re.DOTALL)
        if match:
            return match.group(1) + "\n" + match.group(2) + "\n" + match.group(3)
        return (
            "1. WORD BREAKDOWN:\n"
            " - Analysis incomplete due to model limitations.\n\n"
            "2. LINGUISTIC INSIGHT:\n"
            " - Word Order: Analysis incomplete.\n"
            " - Tense/Aspect Markers: Analysis incomplete.\n"
            " - Contextual Insight: Analysis incomplete.\n\n"
            "3. CORRECTNESS ASSESSMENT:\n"
            " - Is the sentence correct? Unknown due to model limitations.\n"
            " - Reason for Incorrectness (if applicable): Unknown.\n"
            " - Corrections: None provided."
        )

    def analyze_sentence(self, sentence: str, lang: str = "en") -> dict:
        """Full analysis pipeline for a Zarma sentence using Gemini-2.0-flash."""
        grammar_issues = self.check_grammar(sentence)
        formatted_grammar = self.format_grammar_issues(grammar_issues)
        glossary_results = self.translate_and_explain_words(sentence)
        formatted_glossary = self.format_glossary_info(glossary_results)
        language = "English" if lang == "en" else "French"
        prompt = self.analysis_template.format(
            sentence=sentence,
            grammar_check=formatted_grammar,
            glossary_info=formatted_glossary,
            language=language
        )
        raw_analysis = ""
        try:
            response = self.model.generate_content(prompt)
            raw_analysis = response.text
        except Exception as e:
            raw_analysis = f"Error in analysis generation: {str(e)}"
        analysis_result = self.extract_analysis(raw_analysis)
        reliable_context = self.filter_reliable_context(glossary_results, analysis_result)
        return {
            "sentence": sentence,
            "grammar_issues": grammar_issues,
            "formatted_grammar": formatted_grammar,
            "analysis_result": analysis_result,
            "retrieved_context": reliable_context
        }

    def format_output(self, results: dict, lang: str = "en") -> str:
        """Format the analysis results for Gradio output in the selected language."""
        if lang == "fr":
            output = "=" * 80 + "\n"
            output += "ANALYSEUR DE LANGUE ZARMA\n"
            output += "=" * 80 + "\n\n"
            output += f"Phrase Analysée: \"{results['sentence']}\"\n"
            output += f"État de la Grammaire: {'Problèmes détectés' if results['grammar_issues'] else 'Correct'}\n\n"
            output += "Analyse Détaillée:\n"
            output += "-" * 80 + "\n"
            output += results['analysis_result'] + "\n\n"
            output += "Sources de Contexte Fiables:\n"
            output += "-" * 80 + "\n"
            if results["retrieved_context"]:
                for i, context in enumerate(results["retrieved_context"], 1):
                    output += f"Source {i}:\n{context}\n\n"
            else:
                output += "Aucune source de contexte fiable récupérée basée sur l'analyse.\n"
            output += "=" * 80
        else:  # Default to English
            output = "=" * 80 + "\n"
            output += "ZARMA LANGUAGE ANALYZER\n"
            output += "=" * 80 + "\n\n"
            output += f"Sentence Analyzed: \"{results['sentence']}\"\n"
            output += f"Grammar Status: {'Issues detected' if results['grammar_issues'] else 'Correct'}\n\n"
            output += "Detailed Analysis:\n"
            output += "-" * 80 + "\n"
            output += results['analysis_result'] + "\n\n"
            output += "Reliable Context Sources:\n"
            output += "-" * 80 + "\n"
            if results["retrieved_context"]:
                for i, context in enumerate(results["retrieved_context"], 1):
                    output += f"Source {i}:\n{context}\n\n"
            else:
                output += "No reliable context sources retrieved based on the analysis.\n"
            output += "=" * 80
        return output


# Initialize the analyzer (adjust paths to match your Hugging Face Space structure)
analyzer = ZarmaLanguageAnalyzer("grammar_rules.json", "glossary.json")
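
# The analyzer can also be driven without the Gradio UI. A minimal sketch, assuming
# GOOGLE_API_KEY is set and the two JSON files above are present:
#
#     results = analyzer.analyze_sentence("Ay ga koy.", lang="en")
#     print(analyzer.format_output(results, lang="en"))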

# Gradio interface
def analyze_zarma_sentence(sentence, output_in_english):
    if not sentence.strip():
        return "Please enter a valid Zarma sentence." if output_in_english else "Veuillez entrer une phrase Zarma valide."
    lang = "en" if output_in_english else "fr"
    results = analyzer.analyze_sentence(sentence, lang=lang)
    return analyzer.format_output(results, lang=lang)

# Define the Gradio UI
with gr.Blocks(title="Zarma Language Analyzer") as demo:
    gr.Markdown("# Zarma Language Analyzer")
    gr.Markdown("Enter a Zarma sentence below to analyze its grammar and meaning.")
    sentence_input = gr.Textbox(label="Zarma Sentence", placeholder="e.g., Ay ga koy.")
    language_checkbox = gr.Checkbox(label="Output in English (uncheck for French)", value=True)
    analyze_button = gr.Button("Analyze")
    output_text = gr.Textbox(label="Analysis Result", lines=20)
    analyze_button.click(
        fn=analyze_zarma_sentence,
        inputs=[sentence_input, language_checkbox],
        outputs=output_text
    )

# Launch the app (share=True is only needed for a temporary public link when running
# locally; on Hugging Face Spaces the app is hosted directly)
demo.launch(share=True)