Align Whisper default to turbo-v3 + add document upload to Knowledge Base tab
The __main__ re-read of WHISPER_MODEL_ID still defaulted to whisper-small,
overriding the module-level default of whisper-large-v3-turbo. Align both
so the Space uses turbo-v3 consistently with the training notebook.
Add a new section to the Knowledge Base tab that accepts multiple PDF,
Word, or TXT uploads tagged to a specific language (Bambara or Fula).
Each upload:
- is parsed with pypdf (PDF), python-docx (Word), or read as plain text (TXT)
- runs through language-specific normalisation (Adlam -> Latin for Fula,
French-influenced spellings -> standard Bambara)
- is split into 3-25-word sentences
- is appended to vocabulary.jsonl in the feedback repo so the Kaggle notebook
picks it up on the next training run
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- app.py +155 -2
- requirements.txt +4 -0
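For reference, each sentence extracted by the new upload path is appended to vocabulary.jsonl as one JSON object per line. A minimal sketch of one appended record, mirroring the fields built in _import_documents in the diff below (the sentence text is illustrative, and the source note reuses the placeholder from the UI):

    import json

    entry = {
        "word": "sentence text after language-specific normalisation",  # illustrative value
        "translation": "",                    # document imports carry no translation
        "language": "ful",                    # "bam" or "ful", taken from the dropdown
        "source": "document: SIL Pular grammar book",
    }
    print(json.dumps(entry, ensure_ascii=False))  # one line of vocabulary.jsonl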
app.py
@@ -6,7 +6,7 @@ Environment variables (set in Space Settings → Secrets):
     HF_TOKEN — HF write-access token
     FEEDBACK_REPO_ID — e.g. ous-sow/sahel-agri-feedback (dataset, private)
     ADAPTER_REPO_ID — e.g. ous-sow/sahel-agri-adapters (model, private)
-    WHISPER_MODEL_ID — default: openai/whisper-
+    WHISPER_MODEL_ID — default: openai/whisper-large-v3-turbo
     LLM_MODEL_ID — default: Qwen/Qwen2.5-72B-Instruct
     KAGGLE_USERNAME — Kaggle username (for auto-trigger training)
     KAGGLE_KEY — Kaggle API key (for auto-trigger training)
@@ -873,6 +873,121 @@ def _import_phrase_pairs(lang_label: str, pairs_text: str) -> str:
     return f"✅ Added {count} phrase(s) for {lang_label}. Library now has {total} phrases. Available immediately."


+def _extract_text_from_document(file_path: str) -> str:
+    """Extract plain text from a PDF, DOCX, or TXT file. Returns empty string on failure."""
+    ext = Path(file_path).suffix.lower()
+    try:
+        if ext == ".pdf":
+            from pypdf import PdfReader
+            reader = PdfReader(file_path)
+            return "\n".join((p.extract_text() or "") for p in reader.pages)
+        if ext in (".docx", ".doc"):
+            from docx import Document
+            doc = Document(file_path)
+            return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
+        if ext in (".txt", ".md"):
+            with open(file_path, encoding="utf-8", errors="ignore") as f:
+                return f.read()
+    except Exception as exc:
+        import logging
+        logging.getLogger(__name__).warning("Document extract failed for %s: %s", file_path, exc)
+    return ""
+
+
+def _sentences_from_text(text: str, min_words: int = 3, max_words: int = 25) -> list[str]:
+    """Split extracted text into clean sentences suitable for vocabulary.jsonl."""
+    import re as _re
+    # Normalise whitespace and split on sentence boundaries (., !, ?, or double newline)
+    text = _re.sub(r"\s+", " ", text).strip()
+    raw = _re.split(r"(?<=[.!?])\s+|\n\n+", text)
+    out = []
+    seen = set()
+    for s in raw:
+        s = s.strip(" \t\"'`—–-")
+        if not s:
+            continue
+        words = s.split()
+        if not (min_words <= len(words) <= max_words):
+            continue
+        key = s.lower()
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(s)
+    return out
+
+
+def _import_documents(lang_label: str, files: list, source_note: str) -> str:
+    """Extract sentences from uploaded PDF/Word/TXT files and append to vocabulary.jsonl."""
+    if not files:
+        return "⚠️ Please upload at least one document first."
+    lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
+    # Language normalisation — same rule as other ingestion paths
+    total_sentences = 0
+    per_file_summary = []
+    all_entries: list[dict] = []
+    for f in files:
+        # Gradio File component returns a tempfile path (or an object with .name)
+        path = f if isinstance(f, str) else getattr(f, "name", None)
+        if not path:
+            continue
+        text = _extract_text_from_document(path)
+        if not text.strip():
+            per_file_summary.append(f" - {Path(path).name}: ⚠️ no text extracted")
+            continue
+        # Apply language-specific normalisation so Adlam → Latin etc.
+        try:
+            if lang == "ful":
+                text = normalize_pular(text)
+            elif lang == "bam":
+                text = bam_normalize(text)
+        except Exception:
+            pass
+        sentences = _sentences_from_text(text)
+        for s in sentences:
+            all_entries.append({
+                "word": s,
+                "translation": "",
+                "language": lang,
+                "source": f"document: {source_note or Path(path).name}",
+            })
+        per_file_summary.append(f" - {Path(path).name}: {len(sentences)} sentence(s)")
+        total_sentences += len(sentences)
+
+    if not all_entries:
+        return "⚠️ No usable sentences found in the uploaded document(s).\n" + "\n".join(per_file_summary)
+
+    # Append to vocabulary.jsonl on Hub (same pattern as _append_phrases_to_vocabulary_jsonl)
+    if _hf_api is not None and FEEDBACK_REPO_ID:
+        try:
+            from huggingface_hub import hf_hub_download
+            try:
+                local = hf_hub_download(
+                    repo_id=FEEDBACK_REPO_ID, filename="vocabulary.jsonl",
+                    repo_type="dataset", token=HF_TOKEN,
+                )
+                with open(local, encoding="utf-8") as f:
+                    existing = f.read()
+            except Exception:
+                existing = ""
+            new_lines = "".join(json.dumps(e, ensure_ascii=False) + "\n" for e in all_entries)
+            _hf_api.upload_file(
+                path_or_fileobj=io.BytesIO((existing + new_lines).encode("utf-8")),
+                path_in_repo="vocabulary.jsonl",
+                repo_id=FEEDBACK_REPO_ID,
+                repo_type="dataset",
+            )
+            threading.Thread(target=_refresh_vocab_context, daemon=True).start()
+        except Exception as exc:
+            return f"⚠️ Extracted {total_sentences} sentence(s) but Hub upload failed: {exc}"
+
+    return (
+        f"✅ Imported {total_sentences} sentence(s) for {lang_label} from {len(files)} document(s).\n"
+        + "\n".join(per_file_summary)
+        + "\n\nThese will be used by the Kaggle training notebook on the next run."
+    )
+
+
 def _append_phrases_to_vocabulary_jsonl(lang: str, pairs_text: str) -> None:
     """Append phrase pairs to vocabulary.jsonl in the feedback repo (training input)."""
     if _hf_api is None or not FEEDBACK_REPO_ID:
@@ -1905,6 +2020,44 @@ def build_ui() -> gr.Blocks:
                 outputs=[yt_status],
             )

+            # ── Document upload (PDF / Word / TXT) ───────────────────────
+            gr.Markdown("---")
+            gr.Markdown(
+                "### 📄 Upload documents (PDF, Word, TXT)\n"
+                "Extract sentences from books, articles, or lesson PDFs. "
+                "Each sentence is added to the training vocabulary in the language you select below. "
+                "**Upload one batch per language** — do not mix Bambara and Fula files in one upload."
+            )
+            with gr.Row():
+                with gr.Column():
+                    doc_lang = gr.Dropdown(
+                        choices=["Bambara (bam)", "Fula (ful)"],
+                        value="Fula (ful)",
+                        label="Language of these documents",
+                    )
+                    doc_files = gr.File(
+                        label="Upload .pdf, .docx, or .txt (multiple allowed)",
+                        file_count="multiple",
+                        file_types=[".pdf", ".docx", ".doc", ".txt", ".md"],
+                    )
+                    doc_source = gr.Textbox(
+                        placeholder="e.g. SIL Pular grammar book, Labé lesson PDFs",
+                        label="Source note (optional — for your records)",
+                    )
+                    doc_btn = gr.Button("📥 Extract & Add to Training Data", variant="primary")
+                with gr.Column():
+                    doc_status = gr.Textbox(
+                        label="Import status",
+                        interactive=False,
+                        lines=12,
+                    )
+
+            doc_btn.click(
+                fn=_import_documents,
+                inputs=[doc_lang, doc_files, doc_source],
+                outputs=[doc_status],
+            )
+
         # ── Tab 4: Model Training ─────────────────────────────────────────
         with gr.TabItem("🔧 Model Training"):
             gr.Markdown(
@@ -2057,7 +2210,7 @@ if __name__ == "__main__":
     HF_TOKEN = os.environ.get("HF_TOKEN")
     FEEDBACK_REPO_ID = os.environ.get("FEEDBACK_REPO_ID", "ous-sow/sahel-agri-feedback")
     ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", "ous-sow/sahel-agri-adapters")
-    WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID", "openai/whisper-small")
+    WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID", "openai/whisper-large-v3-turbo")
     LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")

     if HF_TOKEN:
requirements.txt
@@ -44,3 +44,7 @@ rapidfuzz==3.13.0

 # Kaggle API (used by Self-Teaching tab to trigger training runs)
 kaggle>=1.6.0
+
+# Document parsing for Knowledge Base tab (PDF/Word/TXT upload → vocabulary.jsonl)
+pypdf>=4.0.0
+python-docx>=1.1.0