File size: 3,417 Bytes
327bd85
 
 
 
 
b09d94b
 
42c4e1a
 
 
8c7a402
42c4e1a
 
e36aaa8
 
 
527919e
 
 
 
 
 
 
 
 
 
 
 
e36aaa8
 
 
 
cb3e34f
 
 
 
 
 
 
 
 
 
42c4e1a
 
 
cb3e34f
 
 
 
 
 
42c4e1a
cb3e34f
 
 
 
d7a5646
 
 
 
cb3e34f
 
 
 
 
 
 
 
 
 
 
93c2b81
42c4e1a
 
b09d94b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os

# Redirect cache to a writable path inside container
# NOTE: this must run BEFORE importing gradio / impresso_pipelines below,
# since those presumably read XDG_CACHE_HOME at import time — confirm.
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

import gradio as gr
from impresso_pipelines.solrnormalization import SolrNormalizationPipeline

# Single shared pipeline instance, created once at startup and reused for
# every request handled by normalize().
pipeline = SolrNormalizationPipeline()

# Language codes the pipeline supports; "general" appears to be a
# language-agnostic analyzer. The UI dropdown prepends "Auto-detect".
LANGUAGES = ["de", "fr", "es", "it", "pt", "nl", "en", "general"]

def normalize(text, lang_choice):
    """Run the Solr normalization pipeline on *text* and return a report string.

    A lang_choice of "Auto-detect" lets the pipeline pick the language on its
    own; any other value is forwarded unchanged. Exceptions are caught at this
    UI boundary and rendered as an error string so the interface never crashes.
    """
    try:
        selected = None if lang_choice == "Auto-detect" else lang_choice
        result = pipeline(text, lang=selected, diagnostics=True)

        # Render each analyzer stage as "  N. type: name" (1-based numbering).
        formatted = []
        if 'analyzer_pipeline' in result and result['analyzer_pipeline']:
            formatted = [
                f"  {idx}. {stage.get('type', 'unknown')}: {stage.get('name', 'unnamed')}"
                for idx, stage in enumerate(result['analyzer_pipeline'], 1)
            ]

        if formatted:
            analyzer_display = "\n".join(formatted)
        else:
            analyzer_display = "  No analyzer steps found"

        return f"🌍 Language: {result['language']}\n\n🔤 Tokens:\n{result['tokens']}\n\n🚫 Detected stopwords:\n{result['stopwords_detected']}\n\n⚙️ Analyzer pipeline:\n{analyzer_display}"
    except Exception as e:
        # Log to stdout for container logs, then surface the error in the UI.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"

# Define example inputs for different languages
# Each entry is [text, language_choice], matching the two Interface inputs
# (the textbox and the language dropdown) in order.
examples = [
    ["The quick brown fox jumps over the lazy dog. This is a sample text for testing.", "en"],
    ["Der schnelle braune Fuchs springt über den faulen Hund. Dies ist ein Beispieltext zum Testen.", "de"],
    ["Le renard brun rapide saute par-dessus le chien paresseux. Ceci est un texte d'exemple pour les tests.", "fr"],
    ["El zorro marrón rápido salta sobre el perro perezoso. Este es un texto de ejemplo para pruebas.", "es"],
    ["La volpe marrone veloce salta sopra il cane pigro. Questo è un testo di esempio per i test.", "it"],
    ["Auto-detect language: Mixed content with English and Français words together!", "Auto-detect"]
]

# Build the Gradio UI: a text input plus a language dropdown, feeding
# normalize(), with the formatted diagnostics shown in a read-only textbox.
demo = gr.Interface(
    fn=normalize,
    inputs=[
        gr.Textbox(
            label="Enter Text",
            placeholder="Type your text here or try one of the examples below...",
            lines=3
        ),
        # "Auto-detect" is a UI-only sentinel; normalize() maps it to lang=None.
        gr.Dropdown(choices=["Auto-detect"] + LANGUAGES, value="Auto-detect", label="Language")
    ],
    outputs=gr.Textbox(label="Normalized Output", lines=10),
    examples=examples,
    title="🔥 Solr Normalization Pipeline",
    # Rendered as HTML/Markdown above the inputs; the <img> relies on Gradio's
    # "file/..." static serving — presumably logo.jpeg sits next to this script.
    description="""
    <div style="text-align: center; margin-bottom: 20px;">
        <img src="file/logo.jpeg" alt="Logo" style="max-width: 200px; height: auto; border-radius: 8px;">
    </div>
    
    **Solr normalization is intended to give an idea of what kind of normalization is happening behind Impresso.**
    
    This demo replicates Solr's text analysis functionality, showing how text is processed through various normalization steps including tokenization, stopword removal, and language-specific analysis.
    
    Try the examples below or enter your own text to see how different languages are processed!
    """,
    article="""
    ### About
    This tool demonstrates the text normalization pipeline used in the Impresso project, which mirrors Apache Solr's text analysis capabilities.
    """,
    theme=gr.themes.Soft(),
    # NOTE(review): allow_flagging is deprecated in Gradio 4.x in favor of
    # flagging_mode — confirm the pinned gradio version before changing.
    allow_flagging="never"
)

# Bind to all interfaces on the conventional port so the app is reachable
# from outside the container.
demo.launch(server_name="0.0.0.0", server_port=7860)