# NOTE: Hugging Face Spaces page banner ("Spaces: Sleeping") stripped from the
# scraped source — not part of the application code.
| """ | |
| app.py | |
| ββββββ | |
| Interfaz web Gradio para el sistema RAG de correcciΓ³n de castellano s.XVI. | |
| Arranque: | |
| python app.py | |
| Requiere: | |
| - .env con OPENAI_API_KEY | |
| - (opcional) corpus en ./corpus/ | |
| """ | |
import hashlib
import json
import os

import gradio as gr
from dotenv import load_dotenv

from corpus_loader import CorpusLoader
from evaluator import Evaluator
from knowledge_base import SAMPLE_PAIRS
from rag_corrector import RAGCorrector
from vector_store import VectorStore
load_dotenv()

# ── Initialization ────────────────────────────────────────────────────────────
print(" Inicializando Scriptorium RAG...")

vs = VectorStore()

# The working corpus is the embedded sample pairs plus any pairs found on disk.
disk_pairs = CorpusLoader(os.getenv("CORPUS_PATH", "./corpus")).load()
all_pairs = SAMPLE_PAIRS + disk_pairs

# Index everything into ChromaDB.
vs.index(all_pairs)

corrector = RAGCorrector(vs)
evaluator = Evaluator()
print(f" Sistema listo. Documentos en vector store: {vs.count()}")
# ── Demonstration examples ────────────────────────────────────────────────────
# Sample 16th-century Spanish HTR transcriptions (period abbreviations such as
# "dho", old graphies such as "ç"/"uilla") used to pre-fill the input box via
# gr.Examples in the UI below.
DEMO_EXAMPLES = [
    "qΜ fizo merΓ§ed al dho lugar de las alcaualas del anno de mill e quinientos",
    "el escriuano del cabildo faze fe y da testimouio verdadero de todo lo sobredho",
    "en la muy noble Γ§ibdad de burgos a veynte dias del mes de marΓ§o anno dho",
    "yo juan de la torre vezino desta uilla de toledo otorgo e conosco",
    "sepan quantos esta carta de poder vieren como yo pero lopez vezino dela villa",
    "fizo pareΓ§er ante si a los testigos qΜ dixeron ser mayores de veynte annos",
]
# ── Active vector store / embedding selection ─────────────────────────────────
# Tracks which embedding backend is currently active; `cambiar_embedding`
# swaps these globals when the user picks a different model in the UI.
current_embed_model = "openai"
vs = VectorStore(embedding_model="openai")
vs.index(all_pairs)
# BUG FIX: `vs` was rebound here but the RAGCorrector created earlier still
# held a reference to the previous VectorStore instance, so retrieval queried
# a stale index. Rebind the corrector to the active store.
corrector = RAGCorrector(vs)
def cambiar_embedding(embed_model: str) -> str:
    """Switch the embedding backend used for retrieval.

    Builds a VectorStore for `embed_model`, indexes the corpus if that
    collection is empty, and recreates the RAGCorrector on top of it.

    Args:
        embed_model: One of the embedding model names offered in the UI
            dropdown (e.g. "openai", "mpnet").

    Returns:
        A Markdown status message for the UI (also used to report errors).
    """
    global vs, corrector, current_embed_model
    if embed_model == current_embed_model:
        return f"βΉ Ya estΓ‘s usando **{embed_model}**"
    try:
        new_vs = VectorStore(embedding_model=embed_model)
        # Index only when the target collection is empty (first use of
        # this model); otherwise reuse the persisted index.
        if new_vs.count() == 0:
            new_vs.index(all_pairs)
            msg = f" Re-indexado con **{embed_model}** Β· {new_vs.count()} docs"
        else:
            msg = f" Cargado Γndice existente **{embed_model}** Β· {new_vs.count()} docs"
        # BUG FIX: commit the globals only after the new store is ready.
        # Previously `current_embed_model` was set before the try body ran,
        # so a failure left the globals claiming the new model was active
        # while `vs`/`corrector` still used the old one — and retrying the
        # same selection hit the early-return above.
        vs = new_vs
        corrector = RAGCorrector(vs)
        current_embed_model = embed_model
        return msg
    except Exception as e:
        return f" Error cambiando embedding: {e}"
# ── Main correction function ──────────────────────────────────────────────────
def corregir(htr_text: str, top_k: int, mostrar_prompt: bool, model: str):
    """Correct an HTR transcription with the RAG pipeline.

    Args:
        htr_text: Raw recognizer output to correct.
        top_k: Number of similar corpus documents to retrieve as few-shot
            examples (Gradio may pass this as a float; it is cast to int).
        mostrar_prompt: When True, the prompts sent to the LLM are included
            in the last return element.
        model: LLM model name forwarded to the corrector.

    Returns:
        A 6-tuple matching the click handler's output components:
        (corrected_text, retrieved_docs_md, analysis_md, diff_md,
         status_md, prompt_md).
    """
    # BUG FIX: every return path must yield 6 values — the button is wired
    # to 6 output components, so the old 5-element error returns made
    # Gradio raise instead of showing the error message.
    if not htr_text.strip():
        return "", "", "", "", " Introduce un texto HTR para corregir.", ""
    if not os.getenv("OPENAI_API_KEY"):
        return "", "", "", "", " Falta OPENAI_API_KEY en el fichero .env", ""
    try:
        result = corrector.correct(htr_text, top_k=int(top_k), model=model)
    except Exception as e:
        return "", "", "", "", f" Error al llamar a la API: {e}", ""

    corrected = result["corrected"]
    retrieved = result["retrieved"]
    htr_errors = result["htr_errors"]
    grafia_w = result["grafia_warns"]

    # ── Panel: retrieved documents ────────────────────────────────────────
    docs_md = f"### Top-{len(retrieved)} documentos recuperados\n\n"
    for i, doc in enumerate(retrieved, 1):
        docs_md += (
            f"**{i}. [{doc['type']} Β· {doc['region']} Β· {doc['date']}]** "
            f"*similitud: {doc['score']}*\n\n"
            f"- **HTR:** `{doc['htr']}`\n"
            f"- **GT:** `{doc['gt']}`\n"
        )
        if doc["corrections"]:
            docs_md += f"- **Correcciones:** {', '.join(doc['corrections'])}\n"
        docs_md += "\n---\n"

    # ── Panel: pattern analysis ───────────────────────────────────────────
    analysis_md = "### AnΓ‘lisis del texto\n\n"
    if htr_errors:
        analysis_md += "**β Posibles errores HTR detectados:**\n"
        for e in htr_errors:
            analysis_md += f"- `{e['htr']}` β `{e['gt']}`: {e['context']} \n *Ej: {e['example']}*\n"
        analysis_md += "\n"
    if grafia_w:
        analysis_md += "**β¦ Alertas de grafΓa (NO modernizar):**\n"
        for g in grafia_w:
            analysis_md += f"- `{g['modern']}` β mantener `{g['ancient']}`: {g['rule']}\n"
        analysis_md += "\n"
    if not htr_errors and not grafia_w:
        analysis_md += "*No se detectaron patrones conocidos de error en el texto.*\n"

    # ── Word-by-word visual diff ──────────────────────────────────────────
    diff_md = "### Diferencias HTR β Corregido\n\n"
    orig_words = htr_text.split()
    corr_words = corrected.split()
    diff_parts = []
    max_len = max(len(orig_words), len(corr_words))
    changed = 0
    for i in range(max_len):
        # Pad the shorter side so insertions/deletions still show up.
        o = orig_words[i] if i < len(orig_words) else "β"
        c = corr_words[i] if i < len(corr_words) else "β"
        if o != c:
            diff_parts.append(f"~~{o}~~ β **{c}**")
            changed += 1
        else:
            diff_parts.append(c)
    diff_md += " ".join(diff_parts)
    diff_md += f"\n\n*{changed} palabra(s) modificada(s) de {len(orig_words)} totales.*"

    status = f" CorrecciΓ³n completada con **{result['model']}** Β· {vs.count()} docs en Γndice"

    # ── Prompt panel (optional) ───────────────────────────────────────────
    prompt_visible = ""
    if mostrar_prompt:
        prompt_visible = (
            "### System Prompt\n\n"
            f"```\n{result.get('_system', '(ver rag_corrector.py)')}\n```\n\n"
            "### User Prompt (dinΓ‘mico)\n\n"
            f"```\n{result['prompt']}\n```"
        )
    return corrected, docs_md, analysis_md, diff_md, status, prompt_visible
def evaluar_par(htr_text: str, gt_text: str) -> str:
    """Correct `htr_text` with the RAG pipeline and score it against `gt_text`.

    Produces a Markdown report with CER/WER before and after correction
    plus the modernism-detector summary, or an error message string.
    """
    if not htr_text.strip() or not gt_text.strip():
        return "β Introduce tanto el texto HTR como el groundtruth."
    try:
        result = corrector.correct(htr_text)
        m = evaluator.evaluate_pair(htr_text, result["corrected"], gt_text)
        mod = m["modernism"]

        # Metrics table + modernism summary.
        report = (
            "### MΓ©tricas de evaluaciΓ³n\n\n"
            "| MΓ©trica | Antes (HTR) | DespuΓ©s (RAG) | Mejora |\n"
            "|---------|------------|---------------|--------|\n"
            f"| **CER** | {m['cer_before']:.2%} | {m['cer_after']:.2%} | {m['cer_improvement']:+.2%} |\n"
            f"| **WER** | {m['wer_before']:.2%} | {m['wer_after']:.2%} | {m['wer_improvement']:+.2%} |\n\n"
            f"**Detector de modernismos:** score={mod['score']:.2f} "
            f"({mod['count']} problema(s) detectado(s))\n"
        )

        # One bullet per modern form that should have stayed archaic.
        if mod["issues"]:
            bullets = "".join(
                f"- `{iss['modern']}` (deberΓa ser `{iss['ancient']}`): {iss['rule']}\n"
                for iss in mod["issues"]
            )
            report += "\nFormas modernas introducidas incorrectamente:\n" + bullets

        report += f"\n**Texto corregido por RAG:**\n> {result['corrected']}"
        return report
    except Exception as e:
        return f" Error: {e}"
def add_to_corpus(htr_text: str, gt_text: str, doc_type: str, region: str, date: str, caligrafia: str) -> str:
    """Add a user-contributed HTR/groundtruth pair to the vector store.

    Empty metadata fields fall back to "desconocido"/"desconocida"/"".

    Returns:
        A Markdown status message: success with the new id, a notice if the
        pair already existed, or an error description.
    """
    if not htr_text.strip() or not gt_text.strip():
        return "β HTR y GT son obligatorios."
    try:
        # BUG FIX: the id used to come from the built-in hash(), which is
        # salted per process (PYTHONHASHSEED), so the same text produced a
        # different id on every restart and the duplicate check below could
        # never match pairs added in a previous session. A content digest
        # gives a stable, deterministic id.
        digest = hashlib.sha1(htr_text.strip().encode("utf-8")).hexdigest()
        pair_id = f"user_{int(digest, 16) % 100000:05d}"
        new_pair = {
            "id": pair_id,
            "htr": htr_text.strip(),
            "gt": gt_text.strip(),
            "type": doc_type or "desconocido",
            "region": region or "desconocida",
            "date": date or "",
            "caligrafia": caligrafia or "desconocida",
            "corrections": [],
            "source": "user_added",
        }
        # vs.index is assumed to return a falsy value when the id already
        # exists — TODO confirm against vector_store.py.
        added = vs.index([new_pair])
        if added:
            return f" Par aΓ±adido al corpus con id `{pair_id}`. Total: {vs.count()} docs."
        else:
            return f" Par ya existΓa en el corpus (id: `{pair_id}`)."
    except Exception as e:
        return f" Error: {e}"
# ── Gradio interface ──────────────────────────────────────────────────────────
# Four tabs: correction, evaluation against groundtruth, corpus growth, and a
# static system-status page. NOTE(review): nesting of Rows/Columns was
# reconstructed from context — the source paste had lost its indentation.
with gr.Blocks(
    title="Scriptorium RAG",
    theme=gr.themes.Base(
        primary_hue="amber",
        secondary_hue="stone",
        neutral_hue="stone",
        font=gr.themes.GoogleFont("IM Fell English"),
    ),
    css="""
    .header { text-align: center; padding: 20px 0 10px; }
    .header h1 { font-size: 2.2em; color: #92400e; letter-spacing: 0.15em; }
    .header p { color: #78716c; font-style: italic; }
    .status-bar { font-size: 0.85em; padding: 6px 12px; border-radius: 6px; }
    """,
) as demo:
    # ── Header banner ─────────────────────────────────────────────────────────
    gr.HTML("""
    <div class="header">
        <h1>RAG CODEX for Historical Spanish</h1>
        <p>RAG system of Spanish correction from the 16th century</p>
    </div>
    """)
    with gr.Tabs():
        # ── Tab 1: Correction ─────────────────────────────────────────────────
        with gr.TabItem(" HTR Correction"):
            with gr.Row():
                with gr.Column(scale=2):
                    htr_input = gr.Textbox(
                        label="HTR text (recognizer input)",
                        placeholder="Paste the HTR result hereβ¦",
                        lines=6,
                    )
                    with gr.Row():
                        top_k_slider = gr.Slider(
                            minimum=1, maximum=10, value=5, step=1,
                            label="Documents retrieved (k)",
                        )
                        model_selector = gr.Dropdown(
                            label="Modelo LLM",
                            choices=[
                                "llama-3.3-70b-versatile",
                                "openai/gpt-oss-120b",
                            ],
                            value="llama-3.3-70b-versatile",
                        )
                        embedding_selector = gr.Dropdown(
                            label="Modelo de Embedding",
                            choices=[
                                "openai",  # text-embedding-3-small
                                "mpnet",  # paraphrase-multilingual-mpnet-base-v2
                                "mt5-base fine-tuned",
                            ],
                            value="openai",
                        )
                    show_prompt = gr.Checkbox(label="Show RAG prompt", value=False)
                    btn_corregir = gr.Button("β¦ Correct with RAG", variant="primary")
                    gr.Examples(
                        examples=DEMO_EXAMPLES,
                        inputs=htr_input,
                        label="Demonstration examples",
                    )
                with gr.Column(scale=2):
                    corrected_out = gr.Textbox(
                        label="Corrected text (RAG output)",
                        lines=6,
                        interactive=False,
                    )
                    status_out = gr.Markdown(elem_classes=["status-bar"])
            with gr.Row():
                with gr.Column():
                    docs_out = gr.Markdown(label="Documents recovered from the corpus")
                with gr.Column():
                    analysis_out = gr.Markdown(label="Pattern analysis")
            diff_out = gr.Markdown(label="Word-by-word differences")
            prompt_out = gr.Markdown(label="Prompt sent to the LLM", visible=True)
            # The click handler is wired to exactly 6 output components, so
            # `corregir` must return a 6-tuple on every path.
            btn_corregir.click(
                fn=corregir,
                inputs=[htr_input, top_k_slider, show_prompt, model_selector],
                outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out, prompt_out],
            )
            embed_status = gr.Markdown()
            # Changing the embedding backend rebuilds the vector store and
            # corrector (see `cambiar_embedding`).
            embedding_selector.change(
                fn=cambiar_embedding,
                inputs=[embedding_selector],
                outputs=[embed_status],
            )
        # ── Tab 2: Evaluation ─────────────────────────────────────────────────
        with gr.TabItem(" Evaluation with GT"):
            gr.Markdown("Compare the RAG correction against the actual groundtruth to measure CER/WER and detect modernisms.")
            with gr.Row():
                eval_htr = gr.Textbox(label="HTR text", lines=4)
                eval_gt = gr.Textbox(label="Groundtruth (reference)", lines=4)
            btn_eval = gr.Button("Evaluate", variant="primary")
            eval_out = gr.Markdown()
            btn_eval.click(fn=evaluar_par, inputs=[eval_htr, eval_gt], outputs=eval_out)
        # ── Tab 3: Add to corpus ──────────────────────────────────────────────
        with gr.TabItem("β Add to corpus"):
            gr.Markdown("Add new pairs to the vector store to improve the RAG continuously.")
            with gr.Row():
                add_htr = gr.Textbox(label="Texto HTR", lines=4)
                add_gt = gr.Textbox(label="Groundtruth corregido", lines=4)
            with gr.Row():
                add_type = gr.Textbox(label="Document type", placeholder="notarial / judicial / eclesiastico")
                add_region = gr.Textbox(label="Region", placeholder="Castilla, AndalucΓaβ¦")
                add_date = gr.Textbox(label="Date", placeholder="1542")
                add_caligrafia = gr.Dropdown(
                    label="CaligrafΓa",
                    choices=["desconocida", "procesal", "encadenada", "italica"],
                    value="desconocida",
                )
            btn_add = gr.Button("Add to corpus", variant="primary")
            add_out = gr.Markdown()
            btn_add.click(
                fn=add_to_corpus,
                inputs=[add_htr, add_gt, add_type, add_region, add_date, add_caligrafia],
                outputs=add_out,
            )
        # ── Tab 4: System info ────────────────────────────────────────────────
        # Static snapshot rendered once at startup; the doc counts do not
        # refresh when pairs are added later.
        with gr.TabItem("βΉ System"):
            gr.Markdown(f"""
## System status
- **Modelo LLM:** {os.getenv('OPENAI_MODEL', 'gpt-4o')}
- **Vector store:** ChromaDB (persistente en `{os.getenv('CHROMA_PATH','./chroma_db')}`)
- **Documentos indexados:** {vs.count()}
- **Corpus cargado desde disco:** {len(disk_pairs)} pares
- **Pares de ejemplo embebidos:** {len(SAMPLE_PAIRS)}
## Arquitectura
```
Texto HTR
β
βββΊ Detector de patrones HTR (knowledge_base.py)
βββΊ Detector de grafΓas modernas (knowledge_base.py)
β
βββΊ Embedding (text-embedding-3-small)
β β
β βββΊ BΓΊsqueda top-k en ChromaDB βββΊ Few-shot dinΓ‘mico
β
βββΊ Prompt constructor βββΊ GPT-4o βββΊ Texto corregido
```
## Formato del corpus
Para aΓ±adir tu corpus, crea `./corpus/` con ficheros JSON:
```json
[
  {{"id": "doc001", "htr": "texto htr...", "gt": "groundtruth...",
   "type": "notarial", "region": "Castilla", "date": "1542"}},
  ...
]
```
O CSV con columnas: `id, htr, gt, type, region, date`
""")
if __name__ == "__main__":
    # NOTE(review): credentials are hard-coded ("admin"/"admin") — move them
    # to environment variables before exposing this app publicly.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        auth=("admin", "admin"),  # basic authentication (optional)
        share=False,
        show_error=True,
    )
    demo.launch(**launch_options)