Spaces:

impresso-project
/

solr-normalization-demo

Sleeping

maslionok

change

d7a5646 3 months ago

3.42 kB

	import os

	# Redirect cache to a writable path inside container.
	# NOTE(review): this assignment sits above the gradio / impresso_pipelines
	# imports, presumably so those libraries see the override when they load —
	# keep the statement order unless that is confirmed unnecessary.
	os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

	import gradio as gr
	from impresso_pipelines.solrnormalization import SolrNormalizationPipeline

	# One pipeline instance, built at import time and reused by every normalize() call.
	pipeline = SolrNormalizationPipeline()

	# Language codes offered in the UI dropdown; "Auto-detect" is prepended where the dropdown is built.
	LANGUAGES = ["de", "fr", "es", "it", "pt", "nl", "en", "general"]

	def normalize(text, lang_choice):
	    """Run the normalization pipeline on ``text`` and format a plain-text report.

	    Args:
	        text: Raw text to send through the module-level ``pipeline``.
	        lang_choice: A language code, or ``"Auto-detect"``, in which case
	            ``lang=None`` is passed to the pipeline.

	    Returns:
	        A string with four sections (language, tokens, detected stopwords,
	        analyzer pipeline) built from the pipeline result, or
	        ``"Error: <message>"`` if any step raises.
	    """
	    try:
	        if lang_choice == "Auto-detect":
	            lang = None
	        else:
	            lang = lang_choice
	        result = pipeline(text, lang=lang, diagnostics=True)

	        # One numbered line per analyzer step; absent fields fall back to placeholders.
	        step_lines = []
	        if 'analyzer_pipeline' in result and result['analyzer_pipeline']:
	            step_lines = [
	                f" {position}. {entry.get('type', 'unknown')}: {entry.get('name', 'unnamed')}"
	                for position, entry in enumerate(result['analyzer_pipeline'], 1)
	            ]

	        if step_lines:
	            analyzer_display = "\n".join(step_lines)
	        else:
	            analyzer_display = " No analyzer steps found"

	        # Sections are separated by a blank line in the final report.
	        report_sections = (
	            f"🌍 Language: {result['language']}",
	            f"🔤 Tokens:\n{result['tokens']}",
	            f"🚫 Detected stopwords:\n{result['stopwords_detected']}",
	            f"⚙️ Analyzer pipeline:\n{analyzer_display}",
	        )
	        return "\n\n".join(report_sections)
	    except Exception as err:
	        # UI boundary: log the failure and show the message instead of raising.
	        print("❌ Pipeline error:", err)
	        return f"Error: {err}"

	# Sample inputs offered in the UI: one [text, language choice] row per entry,
	# built from an ordered mapping of language choice -> sample text.
	examples = [
	    [sample_text, language_choice]
	    for language_choice, sample_text in {
	        "en": "The quick brown fox jumps over the lazy dog. This is a sample text for testing.",
	        "de": "Der schnelle braune Fuchs springt über den faulen Hund. Dies ist ein Beispieltext zum Testen.",
	        "fr": "Le renard brun rapide saute par-dessus le chien paresseux. Ceci est un texte d'exemple pour les tests.",
	        "es": "El zorro marrón rápido salta sobre el perro perezoso. Este es un texto de ejemplo para pruebas.",
	        "it": "La volpe marrone veloce salta sopra il cane pigro. Questo è un testo di esempio per i test.",
	        "Auto-detect": "Auto-detect language: Mixed content with English and Français words together!",
	    }.items()
	]

	# Single-page UI: a text box and a language dropdown feed normalize(), whose
	# report is shown in a read-out text box.
	# The multi-line string literals below keep their lines exactly as they are in
	# the file; their whitespace is part of the text handed to Gradio.
	demo = gr.Interface(
	    title="🔥 Solr Normalization Pipeline",
	    description="""
	<div style="text-align: center; margin-bottom: 20px;">
	<img src="file/logo.jpeg" alt="Logo" style="max-width: 200px; height: auto; border-radius: 8px;">
	</div>

	Solr normalization is intended to give an idea of what kind of normalization is happening behind Impresso.

	This demo replicates Solr's text analysis functionality, showing how text is processed through various normalization steps including tokenization, stopword removal, and language-specific analysis.

	Try the examples below or enter your own text to see how different languages are processed!
	""",
	    article="""
	### About
	This tool demonstrates the text normalization pipeline used in the Impresso project, which mirrors Apache Solr's text analysis capabilities.
	""",
	    fn=normalize,
	    inputs=[
	        gr.Textbox(
	            lines=3,
	            label="Enter Text",
	            placeholder="Type your text here or try one of the examples below...",
	        ),
	        gr.Dropdown(
	            label="Language",
	            choices=["Auto-detect", *LANGUAGES],
	            value="Auto-detect",
	        ),
	    ],
	    outputs=gr.Textbox(lines=10, label="Normalized Output"),
	    examples=examples,
	    theme=gr.themes.Soft(),
	    allow_flagging="never",
	)

	# Listen on all network interfaces ("0.0.0.0") at port 7860.
	demo.launch(server_port=7860, server_name="0.0.0.0")