jefffffff9 Claude Sonnet 4.6 committed on
Commit
cc50efb
·
1 Parent(s): c33a061

Align Whisper default to turbo-v3 + add document upload to Knowledge Base tab

Browse files

The __main__ re-read of WHISPER_MODEL_ID still defaulted to whisper-small,
overriding the module-level default of whisper-large-v3-turbo. Align both
so the Space uses turbo-v3 consistently with the training notebook.

Add a new section to the Knowledge Base tab that accepts multiple PDF,
Word, or TXT uploads tagged to a specific language (Bambara or Fula).
Each upload:
- is parsed with pypdf / python-docx / plain text
- runs through language-specific normalisation (Adlam -> Latin for Fula,
French-influenced spellings -> standard Bambara)
- is split into 3-25-word sentences
- appends to vocabulary.jsonl in the feedback repo so the Kaggle notebook
picks it up on the next training run

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +155 -2
  2. requirements.txt +4 -0
app.py CHANGED
@@ -6,7 +6,7 @@ Environment variables (set in Space Settings → Secrets):
6
  HF_TOKEN — HF write-access token
7
  FEEDBACK_REPO_ID — e.g. ous-sow/sahel-agri-feedback (dataset, private)
8
  ADAPTER_REPO_ID — e.g. ous-sow/sahel-agri-adapters (model, private)
9
- WHISPER_MODEL_ID — default: openai/whisper-small
10
  LLM_MODEL_ID — default: Qwen/Qwen2.5-72B-Instruct
11
  KAGGLE_USERNAME — Kaggle username (for auto-trigger training)
12
  KAGGLE_KEY — Kaggle API key (for auto-trigger training)
@@ -873,6 +873,121 @@ def _import_phrase_pairs(lang_label: str, pairs_text: str) -> str:
873
  return f"✅ Added {count} phrase(s) for {lang_label}. Library now has {total} phrases. Available immediately."
874
 
875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
  def _append_phrases_to_vocabulary_jsonl(lang: str, pairs_text: str) -> None:
877
  """Append phrase pairs to vocabulary.jsonl in the feedback repo (training input)."""
878
  if _hf_api is None or not FEEDBACK_REPO_ID:
@@ -1905,6 +2020,44 @@ def build_ui() -> gr.Blocks:
1905
  outputs=[yt_status],
1906
  )
1907
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1908
  # ── Tab 4: Model Training ─────────────────────────────────────────
1909
  with gr.TabItem("🔧 Model Training"):
1910
  gr.Markdown(
@@ -2057,7 +2210,7 @@ if __name__ == "__main__":
2057
  HF_TOKEN = os.environ.get("HF_TOKEN")
2058
  FEEDBACK_REPO_ID = os.environ.get("FEEDBACK_REPO_ID", "ous-sow/sahel-agri-feedback")
2059
  ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", "ous-sow/sahel-agri-adapters")
2060
- WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID", "openai/whisper-small")
2061
  LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
2062
 
2063
  if HF_TOKEN:
 
6
  HF_TOKEN — HF write-access token
7
  FEEDBACK_REPO_ID — e.g. ous-sow/sahel-agri-feedback (dataset, private)
8
  ADAPTER_REPO_ID — e.g. ous-sow/sahel-agri-adapters (model, private)
9
+ WHISPER_MODEL_ID — default: openai/whisper-large-v3-turbo
10
  LLM_MODEL_ID — default: Qwen/Qwen2.5-72B-Instruct
11
  KAGGLE_USERNAME — Kaggle username (for auto-trigger training)
12
  KAGGLE_KEY — Kaggle API key (for auto-trigger training)
 
873
  return f"✅ Added {count} phrase(s) for {lang_label}. Library now has {total} phrases. Available immediately."
874
 
875
 
876
+ def _extract_text_from_document(file_path: str) -> str:
877
+ """Extract plain text from a PDF, DOCX, or TXT file. Returns empty string on failure."""
878
+ ext = Path(file_path).suffix.lower()
879
+ try:
880
+ if ext == ".pdf":
881
+ from pypdf import PdfReader
882
+ reader = PdfReader(file_path)
883
+ return "\n".join((p.extract_text() or "") for p in reader.pages)
884
+ if ext in (".docx", ".doc"):
885
+ from docx import Document
886
+ doc = Document(file_path)
887
+ return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
888
+ if ext in (".txt", ".md"):
889
+ with open(file_path, encoding="utf-8", errors="ignore") as f:
890
+ return f.read()
891
+ except Exception as exc:
892
+ import logging
893
+ logging.getLogger(__name__).warning("Document extract failed for %s: %s", file_path, exc)
894
+ return ""
895
+
896
+
897
+ def _sentences_from_text(text: str, min_words: int = 3, max_words: int = 25) -> list[str]:
898
+ """Split extracted text into clean sentences suitable for vocabulary.jsonl."""
899
+ import re as _re
900
+ # Normalise whitespace and split on sentence boundaries (., !, ?, or double newline)
901
+ text = _re.sub(r"\s+", " ", text).strip()
902
+ raw = _re.split(r"(?<=[.!?])\s+|\n\n+", text)
903
+ out = []
904
+ seen = set()
905
+ for s in raw:
906
+ s = s.strip(" \t\"'`—–-")
907
+ if not s:
908
+ continue
909
+ words = s.split()
910
+ if not (min_words <= len(words) <= max_words):
911
+ continue
912
+ key = s.lower()
913
+ if key in seen:
914
+ continue
915
+ seen.add(key)
916
+ out.append(s)
917
+ return out
918
+
919
+
920
def _import_documents(lang_label: str, files: list, source_note: str) -> str:
    """Extract sentences from uploaded PDF/Word/TXT files and append to vocabulary.jsonl.

    Args:
        lang_label: UI dropdown label (e.g. "Fula (ful)"); mapped to a language
            code via SUPPORTED_LANGUAGES, defaulting to "bam" if unrecognised.
        files: Gradio File upload values — either tempfile path strings or
            objects exposing the path on `.name`.
        source_note: optional free-text provenance; falls back to the filename.

    Returns:
        A user-facing status string (success summary or ⚠️-prefixed warning).
        Never raises: per-file and Hub failures are reported in the message.
    """
    if not files:
        return "⚠️ Please upload at least one document first."
    lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
    # Language normalisation — same rule as other ingestion paths
    total_sentences = 0
    per_file_summary = []
    all_entries: list[dict] = []
    for f in files:
        # Gradio File component returns a tempfile path (or an object with .name)
        path = f if isinstance(f, str) else getattr(f, "name", None)
        if not path:
            continue
        text = _extract_text_from_document(path)
        if not text.strip():
            # Extraction failed or file was empty — report per-file, keep going.
            per_file_summary.append(f" - {Path(path).name}: ⚠️ no text extracted")
            continue
        # Apply language-specific normalisation so Adlam → Latin etc.
        # Best-effort: a normaliser crash must not lose the raw text.
        try:
            if lang == "ful":
                text = normalize_pular(text)
            elif lang == "bam":
                text = bam_normalize(text)
        except Exception:
            pass
        sentences = _sentences_from_text(text)
        for s in sentences:
            # "translation" left empty: document sentences are monolingual
            # training text, not phrase pairs.
            all_entries.append({
                "word": s,
                "translation": "",
                "language": lang,
                "source": f"document: {source_note or Path(path).name}",
            })
        per_file_summary.append(f" - {Path(path).name}: {len(sentences)} sentence(s)")
        total_sentences += len(sentences)

    if not all_entries:
        return "⚠️ No usable sentences found in the uploaded document(s).\n" + "\n".join(per_file_summary)

    # Append to vocabulary.jsonl on Hub (same pattern as _append_phrases_to_vocabulary_jsonl)
    # NOTE(review): read-modify-write with no locking — concurrent uploads could
    # lose entries; acceptable for a single-user Space, verify if usage grows.
    if _hf_api is not None and FEEDBACK_REPO_ID:
        try:
            from huggingface_hub import hf_hub_download
            try:
                # Download the current file so new entries are appended, not overwritten.
                local = hf_hub_download(
                    repo_id=FEEDBACK_REPO_ID, filename="vocabulary.jsonl",
                    repo_type="dataset", token=HF_TOKEN,
                )
                with open(local, encoding="utf-8") as f:
                    existing = f.read()
            except Exception:
                # File doesn't exist yet (first import) — start from empty.
                existing = ""
            new_lines = "".join(json.dumps(e, ensure_ascii=False) + "\n" for e in all_entries)
            _hf_api.upload_file(
                path_or_fileobj=io.BytesIO((existing + new_lines).encode("utf-8")),
                path_in_repo="vocabulary.jsonl",
                repo_id=FEEDBACK_REPO_ID,
                repo_type="dataset",
            )
            # Refresh the in-memory vocab context off-thread so the UI isn't blocked.
            threading.Thread(target=_refresh_vocab_context, daemon=True).start()
        except Exception as exc:
            return f"⚠️ Extracted {total_sentences} sentence(s) but Hub upload failed: {exc}"

    return (
        f"✅ Imported {total_sentences} sentence(s) for {lang_label} from {len(files)} document(s).\n"
        + "\n".join(per_file_summary)
        + "\n\nThese will be used by the Kaggle training notebook on the next run."
    )
989
+
990
+
991
  def _append_phrases_to_vocabulary_jsonl(lang: str, pairs_text: str) -> None:
992
  """Append phrase pairs to vocabulary.jsonl in the feedback repo (training input)."""
993
  if _hf_api is None or not FEEDBACK_REPO_ID:
 
2020
  outputs=[yt_status],
2021
  )
2022
 
2023
+ # ── Document upload (PDF / Word / TXT) ───────────────────────
2024
+ gr.Markdown("---")
2025
+ gr.Markdown(
2026
+ "### 📄 Upload documents (PDF, Word, TXT)\n"
2027
+ "Extract sentences from books, articles, or lesson PDFs. "
2028
+ "Each sentence is added to the training vocabulary in the language you select below. "
2029
+ "**Upload one batch per language** — do not mix Bambara and Fula files in one upload."
2030
+ )
2031
+ with gr.Row():
2032
+ with gr.Column():
2033
+ doc_lang = gr.Dropdown(
2034
+ choices=["Bambara (bam)", "Fula (ful)"],
2035
+ value="Fula (ful)",
2036
+ label="Language of these documents",
2037
+ )
2038
+ doc_files = gr.File(
2039
+ label="Upload .pdf, .docx, or .txt (multiple allowed)",
2040
+ file_count="multiple",
2041
+ file_types=[".pdf", ".docx", ".doc", ".txt", ".md"],
2042
+ )
2043
+ doc_source = gr.Textbox(
2044
+ placeholder="e.g. SIL Pular grammar book, Labé lesson PDFs",
2045
+ label="Source note (optional — for your records)",
2046
+ )
2047
+ doc_btn = gr.Button("📥 Extract & Add to Training Data", variant="primary")
2048
+ with gr.Column():
2049
+ doc_status = gr.Textbox(
2050
+ label="Import status",
2051
+ interactive=False,
2052
+ lines=12,
2053
+ )
2054
+
2055
+ doc_btn.click(
2056
+ fn=_import_documents,
2057
+ inputs=[doc_lang, doc_files, doc_source],
2058
+ outputs=[doc_status],
2059
+ )
2060
+
2061
  # ── Tab 4: Model Training ─────────────────────────────────────────
2062
  with gr.TabItem("🔧 Model Training"):
2063
  gr.Markdown(
 
2210
  HF_TOKEN = os.environ.get("HF_TOKEN")
2211
  FEEDBACK_REPO_ID = os.environ.get("FEEDBACK_REPO_ID", "ous-sow/sahel-agri-feedback")
2212
  ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", "ous-sow/sahel-agri-adapters")
2213
+ WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID", "openai/whisper-large-v3-turbo")
2214
  LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
2215
 
2216
  if HF_TOKEN:
requirements.txt CHANGED
@@ -44,3 +44,7 @@ rapidfuzz==3.13.0
44
 
45
  # Kaggle API (used by Self-Teaching tab to trigger training runs)
46
  kaggle>=1.6.0
 
 
 
 
 
44
 
45
  # Kaggle API (used by Self-Teaching tab to trigger training runs)
46
  kaggle>=1.6.0
47
+
48
+ # Document parsing for Knowledge Base tab (PDF/Word/TXT upload → vocabulary.jsonl)
49
+ pypdf>=4.0.0
50
+ python-docx>=1.1.0