|
|
import os

# Point the XDG cache directory at /tmp. The assignment deliberately sits
# ahead of the imports below so that it is already in effect when they run.
# NOTE(review): presumably the default cache location is not writable in the
# deployment environment — confirm.
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

import gradio as gr  # noqa: E402
from impresso_pipelines.solrnormalization import (  # noqa: E402
    SolrNormalizationPipeline,
)

# A single pipeline object is created at import time and shared by all calls.
pipeline = SolrNormalizationPipeline()

# Language choices shown in the UI dropdown (after the "Auto-detect" entry).
LANGUAGES = [
    "de",
    "fr",
    "es",
    "it",
    "pt",
    "nl",
    "en",
    "general",
]
|
|
|
|
|
def normalize(text, lang_choice):
    """Run the normalization pipeline on *text* and format a readable report.

    Args:
        text: Raw text entered by the user.
        lang_choice: Either a language code that is forwarded to the pipeline
            unchanged, or the literal ``"Auto-detect"``, which is forwarded as
            ``lang=None`` (presumably letting the pipeline pick the language —
            confirm against the pipeline documentation).

    Returns:
        A multi-section string (language, tokens, detected stopwords, analyzer
        steps). When the input is blank or the pipeline raises, a string
        starting with ``"Error:"`` is returned instead of raising, so the UI
        always has something to display.
    """
    # Blank input has nothing to normalize; answer directly rather than
    # pushing an empty string (or None) through the pipeline.
    if not text or not text.strip():
        return "Error: no text provided."

    try:
        lang = None if lang_choice == "Auto-detect" else lang_choice
        result = pipeline(text, lang=lang, diagnostics=True)

        # The analyzer description is optional in the result.
        steps = result["analyzer_pipeline"] if "analyzer_pipeline" in result else None
        analyzer_steps = [
            f"  {i}. {step.get('type', 'unknown')}: {step.get('name', 'unnamed')}"
            for i, step in enumerate(steps or [], 1)
        ]
        analyzer_display = (
            "\n".join(analyzer_steps) if analyzer_steps else "  No analyzer steps found"
        )

        return (
            f"🌍 Language: {result['language']}\n\n"
            f"🔤 Tokens:\n{result['tokens']}\n\n"
            f"🚫 Detected stopwords:\n{result['stopwords_detected']}\n\n"
            f"⚙️ Analyzer pipeline:\n{analyzer_display}"
        )
    except Exception as e:
        # UI boundary: report the failure to the console and to the user
        # instead of letting the request crash.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
|
|
|
|
|
|
|
|
# Sample inputs offered in the UI. Each row is [text, language choice], in the
# same order as the interface inputs.
examples = [
    [sample_text, language_choice]
    for sample_text, language_choice in (
        (
            "The quick brown fox jumps over the lazy dog. This is a sample text for testing.",
            "en",
        ),
        (
            "Der schnelle braune Fuchs springt über den faulen Hund. Dies ist ein Beispieltext zum Testen.",
            "de",
        ),
        (
            "Le renard brun rapide saute par-dessus le chien paresseux. Ceci est un texte d'exemple pour les tests.",
            "fr",
        ),
        (
            "El zorro marrón rápido salta sobre el perro perezoso. Este es un texto de ejemplo para pruebas.",
            "es",
        ),
        (
            "La volpe marrone veloce salta sopra il cane pigro. Questo è un testo di esempio per i test.",
            "it",
        ),
        (
            "Auto-detect language: Mixed content with English and Français words together!",
            "Auto-detect",
        ),
    )
]
|
|
|
|
|
# Input widgets, in the order `normalize` receives them: text, then language.
_text_input = gr.Textbox(
    label="Enter Text",
    placeholder="Type your text here or try one of the examples below...",
    lines=3,
)
_language_input = gr.Dropdown(
    choices=["Auto-detect"] + LANGUAGES,
    value="Auto-detect",
    label="Language",
)
_result_output = gr.Textbox(label="Normalized Output", lines=10)

# Markdown/HTML shown above the form.
# NOTE(review): whether the "file/logo.jpeg" URL resolves depends on the
# installed Gradio version and its file-serving rules — confirm.
_DESCRIPTION = """
<div style="text-align: center; margin-bottom: 20px;">
<img src="file/logo.jpeg" alt="Logo" style="max-width: 200px; height: auto; border-radius: 8px;">
</div>

**Solr normalization is intended to give an idea of what kind of normalization is happening behind Impresso.**

This demo replicates Solr's text analysis functionality, showing how text is processed through various normalization steps including tokenization, stopword removal, and language-specific analysis.

Try the examples below or enter your own text to see how different languages are processed!
"""

# Markdown shown below the form.
_ARTICLE = """
### About
This tool demonstrates the text normalization pipeline used in the Impresso project, which mirrors Apache Solr's text analysis capabilities.
"""

# The Gradio app object: one function, two inputs, one text output.
demo = gr.Interface(
    fn=normalize,
    inputs=[_text_input, _language_input],
    outputs=_result_output,
    examples=examples,
    title="🔥 Solr Normalization Pipeline",
    description=_DESCRIPTION,
    article=_ARTICLE,
    theme=gr.themes.Soft(),
    allow_flagging="never",
)
|
|
|
|
|
# Start the web server only when this file is executed as a script, so that
# importing the module (for tests or reuse of `demo`/`normalize`) does not
# block on a running server. Binds to all interfaces on port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|