jefffffff9 Claude Sonnet 4.6 committed on
Commit
cc50efb
·
1 Parent(s): c33a061

Align Whisper default to turbo-v3 + add document upload to Knowledge Base tab

Browse files

The __main__ re-read of WHISPER_MODEL_ID still defaulted to whisper-small,
overriding the module-level default of whisper-large-v3-turbo. Align both
so the Space uses turbo-v3 consistently with the training notebook.

Add a new section to the Knowledge Base tab that accepts multiple PDF,
Word, or TXT uploads tagged to a specific language (Bambara or Fula).
Each upload:
- is parsed with pypdf / python-docx / plain text
- runs through language-specific normalisation (Adlam -> Latin for Fula,
French-influenced spellings -> standard Bambara)
- is split into 3-25-word sentences
- appends to vocabulary.jsonl in the feedback repo so the Kaggle notebook
picks it up on the next training run

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +155 -2
  2. requirements.txt +4 -0
app.py CHANGED
@@ -6,7 +6,7 @@ Environment variables (set in Space Settings → Secrets):
6
  HF_TOKEN — HF write-access token
7
  FEEDBACK_REPO_ID — e.g. ous-sow/sahel-agri-feedback (dataset, private)
8
  ADAPTER_REPO_ID — e.g. ous-sow/sahel-agri-adapters (model, private)
9
- WHISPER_MODEL_ID — default: openai/whisper-small
10
  LLM_MODEL_ID — default: Qwen/Qwen2.5-72B-Instruct
11
  KAGGLE_USERNAME — Kaggle username (for auto-trigger training)
12
  KAGGLE_KEY — Kaggle API key (for auto-trigger training)
@@ -873,6 +873,121 @@ def _import_phrase_pairs(lang_label: str, pairs_text: str) -> str:
873
  return f"✅ Added {count} phrase(s) for {lang_label}. Library now has {total} phrases. Available immediately."
874
 
875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
  def _append_phrases_to_vocabulary_jsonl(lang: str, pairs_text: str) -> None:
877
  """Append phrase pairs to vocabulary.jsonl in the feedback repo (training input)."""
878
  if _hf_api is None or not FEEDBACK_REPO_ID:
@@ -1905,6 +2020,44 @@ def build_ui() -> gr.Blocks:
1905
  outputs=[yt_status],
1906
  )
1907
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1908
  # ── Tab 4: Model Training ─────────────────────────────────────────
1909
  with gr.TabItem("🔧 Model Training"):
1910
  gr.Markdown(
@@ -2057,7 +2210,7 @@ if __name__ == "__main__":
2057
  HF_TOKEN = os.environ.get("HF_TOKEN")
2058
  FEEDBACK_REPO_ID = os.environ.get("FEEDBACK_REPO_ID", "ous-sow/sahel-agri-feedback")
2059
  ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", "ous-sow/sahel-agri-adapters")
2060
- WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID", "openai/whisper-small")
2061
  LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
2062
 
2063
  if HF_TOKEN:
 
6
  HF_TOKEN — HF write-access token
7
  FEEDBACK_REPO_ID — e.g. ous-sow/sahel-agri-feedback (dataset, private)
8
  ADAPTER_REPO_ID — e.g. ous-sow/sahel-agri-adapters (model, private)
9
+ WHISPER_MODEL_ID — default: openai/whisper-large-v3-turbo
10
  LLM_MODEL_ID — default: Qwen/Qwen2.5-72B-Instruct
11
  KAGGLE_USERNAME — Kaggle username (for auto-trigger training)
12
  KAGGLE_KEY — Kaggle API key (for auto-trigger training)
 
873
  return f"✅ Added {count} phrase(s) for {lang_label}. Library now has {total} phrases. Available immediately."
874
 
875
 
876
+ def _extract_text_from_document(file_path: str) -> str:
877
+ """Extract plain text from a PDF, DOCX, or TXT file. Returns empty string on failure."""
878
+ ext = Path(file_path).suffix.lower()
879
+ try:
880
+ if ext == ".pdf":
881
+ from pypdf import PdfReader
882
+ reader = PdfReader(file_path)
883
+ return "\n".join((p.extract_text() or "") for p in reader.pages)
884
+ if ext in (".docx", ".doc"):
885
+ from docx import Document
886
+ doc = Document(file_path)
887
+ return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
888
+ if ext in (".txt", ".md"):
889
+ with open(file_path, encoding="utf-8", errors="ignore") as f:
890
+ return f.read()
891
+ except Exception as exc:
892
+ import logging
893
+ logging.getLogger(__name__).warning("Document extract failed for %s: %s", file_path, exc)
894
+ return ""
895
+
896
+
897
+ def _sentences_from_text(text: str, min_words: int = 3, max_words: int = 25) -> list[str]:
898
+ """Split extracted text into clean sentences suitable for vocabulary.jsonl."""
899
+ import re as _re
900
+ # Normalise whitespace and split on sentence boundaries (., !, ?, or double newline)
901
+ text = _re.sub(r"\s+", " ", text).strip()
902
+ raw = _re.split(r"(?<=[.!?])\s+|\n\n+", text)
903
+ out = []
904
+ seen = set()
905
+ for s in raw:
906
+ s = s.strip(" \t\"'`—–-")
907
+ if not s:
908
+ continue
909
+ words = s.split()
910
+ if not (min_words <= len(words) <= max_words):
911
+ continue
912
+ key = s.lower()
913
+ if key in seen:
914
+ continue
915
+ seen.add(key)
916
+ out.append(s)
917
+ return out
918
+
919
+
920
def _import_documents(lang_label: str, files: list, source_note: str) -> str:
    """Extract sentences from uploaded PDF/Word/TXT files and append to vocabulary.jsonl.

    Args:
        lang_label: UI dropdown label (e.g. "Fula (ful)"); mapped to a language
            code via SUPPORTED_LANGUAGES, defaulting to "bam" if unrecognised.
        files: Gradio File upload values — either tempfile path strings or
            objects exposing the path on `.name`.
        source_note: optional free-text provenance; falls back to the filename.

    Returns:
        A user-facing status string (success summary or ⚠️-prefixed warning).
        Never raises: per-file and Hub failures are reported in the message.
    """
    if not files:
        return "⚠️ Please upload at least one document first."
    lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
    # Language normalisation — same rule as other ingestion paths
    total_sentences = 0
    per_file_summary = []
    all_entries: list[dict] = []
    for f in files:
        # Gradio File component returns a tempfile path (or an object with .name)
        path = f if isinstance(f, str) else getattr(f, "name", None)
        if not path:
            continue
        text = _extract_text_from_document(path)
        if not text.strip():
            # Extraction failed or file was empty — report per-file, keep going.
            per_file_summary.append(f" - {Path(path).name}: ⚠️ no text extracted")
            continue
        # Apply language-specific normalisation so Adlam → Latin etc.
        # Best-effort: a normaliser crash must not lose the raw text.
        try:
            if lang == "ful":
                text = normalize_pular(text)
            elif lang == "bam":
                text = bam_normalize(text)
        except Exception:
            pass
        sentences = _sentences_from_text(text)
        for s in sentences:
            # "translation" left empty: document sentences are monolingual
            # training text, not phrase pairs.
            all_entries.append({
                "word": s,
                "translation": "",
                "language": lang,
                "source": f"document: {source_note or Path(path).name}",
            })
        per_file_summary.append(f" - {Path(path).name}: {len(sentences)} sentence(s)")
        total_sentences += len(sentences)

    if not all_entries:
        return "⚠️ No usable sentences found in the uploaded document(s).\n" + "\n".join(per_file_summary)

    # Append to vocabulary.jsonl on Hub (same pattern as _append_phrases_to_vocabulary_jsonl)
    # NOTE(review): read-modify-write with no locking — concurrent uploads could
    # lose entries; acceptable for a single-user Space, verify if usage grows.
    if _hf_api is not None and FEEDBACK_REPO_ID:
        try:
            from huggingface_hub import hf_hub_download
            try:
                # Download the current file so new entries are appended, not overwritten.
                local = hf_hub_download(
                    repo_id=FEEDBACK_REPO_ID, filename="vocabulary.jsonl",
                    repo_type="dataset", token=HF_TOKEN,
                )
                with open(local, encoding="utf-8") as f:
                    existing = f.read()
            except Exception:
                # File doesn't exist yet (first import) — start from empty.
                existing = ""
            new_lines = "".join(json.dumps(e, ensure_ascii=False) + "\n" for e in all_entries)
            _hf_api.upload_file(
                path_or_fileobj=io.BytesIO((existing + new_lines).encode("utf-8")),
                path_in_repo="vocabulary.jsonl",
                repo_id=FEEDBACK_REPO_ID,
                repo_type="dataset",
            )
            # Refresh the in-memory vocab context off-thread so the UI isn't blocked.
            threading.Thread(target=_refresh_vocab_context, daemon=True).start()
        except Exception as exc:
            return f"⚠️ Extracted {total_sentences} sentence(s) but Hub upload failed: {exc}"

    return (
        f"✅ Imported {total_sentences} sentence(s) for {lang_label} from {len(files)} document(s).\n"
        + "\n".join(per_file_summary)
        + "\n\nThese will be used by the Kaggle training notebook on the next run."
    )
989
+
990
+
991
  def _append_phrases_to_vocabulary_jsonl(lang: str, pairs_text: str) -> None:
992
  """Append phrase pairs to vocabulary.jsonl in the feedback repo (training input)."""
993
  if _hf_api is None or not FEEDBACK_REPO_ID:
 
2020
  outputs=[yt_status],
2021
  )
2022
 
2023
+ # ── Document upload (PDF / Word / TXT) ───────────────────────
2024
+ gr.Markdown("---")
2025
+ gr.Markdown(
2026
+ "### 📄 Upload documents (PDF, Word, TXT)\n"
2027
+ "Extract sentences from books, articles, or lesson PDFs. "
2028
+ "Each sentence is added to the training vocabulary in the language you select below. "
2029
+ "**Upload one batch per language** — do not mix Bambara and Fula files in one upload."
2030
+ )
2031
+ with gr.Row():
2032
+ with gr.Column():
2033
+ doc_lang = gr.Dropdown(
2034
+ choices=["Bambara (bam)", "Fula (ful)"],
2035
+ value="Fula (ful)",
2036
+ label="Language of these documents",
2037
+ )
2038
+ doc_files = gr.File(
2039
+ label="Upload .pdf, .docx, or .txt (multiple allowed)",
2040
+ file_count="multiple",
2041
+ file_types=[".pdf", ".docx", ".doc", ".txt", ".md"],
2042
+ )
2043
+ doc_source = gr.Textbox(
2044
+ placeholder="e.g. SIL Pular grammar book, Labé lesson PDFs",
2045
+ label="Source note (optional — for your records)",
2046
+ )
2047
+ doc_btn = gr.Button("📥 Extract & Add to Training Data", variant="primary")
2048
+ with gr.Column():
2049
+ doc_status = gr.Textbox(
2050
+ label="Import status",
2051
+ interactive=False,
2052
+ lines=12,
2053
+ )
2054
+
2055
+ doc_btn.click(
2056
+ fn=_import_documents,
2057
+ inputs=[doc_lang, doc_files, doc_source],
2058
+ outputs=[doc_status],
2059
+ )
2060
+
2061
  # ── Tab 4: Model Training ─────────────────────────────────────────
2062
  with gr.TabItem("🔧 Model Training"):
2063
  gr.Markdown(
 
2210
  HF_TOKEN = os.environ.get("HF_TOKEN")
2211
  FEEDBACK_REPO_ID = os.environ.get("FEEDBACK_REPO_ID", "ous-sow/sahel-agri-feedback")
2212
  ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", "ous-sow/sahel-agri-adapters")
2213
+ WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID", "openai/whisper-large-v3-turbo")
2214
  LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
2215
 
2216
  if HF_TOKEN:
requirements.txt CHANGED
@@ -44,3 +44,7 @@ rapidfuzz==3.13.0
44
 
45
  # Kaggle API (used by Self-Teaching tab to trigger training runs)
46
  kaggle>=1.6.0
 
 
 
 
 
44
 
45
  # Kaggle API (used by Self-Teaching tab to trigger training runs)
46
  kaggle>=1.6.0
47
+
48
+ # Document parsing for Knowledge Base tab (PDF/Word/TXT upload → vocabulary.jsonl)
49
+ pypdf>=4.0.0
50
+ python-docx>=1.1.0