HamidOmarov committed
Commit af613b6 · 1 Parent(s): 4448508

RAG: ftfy + AZ spacing fix, pdfminer fallback; smarter synthesis

Files changed (1)
  1. app/rag_system.py +146 -107
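What the text cleanup in this commit amounts to, shown as a standalone sketch (not code from the commit itself; fix_spacing and fix_encoding are illustrative names, and it assumes ftfy is installed): runs of single letters that the PDF extractor split apart ("H Ə F T Ə") are re-joined, and UTF-8 text that was decoded as Latin-1 is round-tripped before a final ftfy pass.

import re
from ftfy import fix_text

AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇçÜü"
_SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")

def fix_spacing(s: str) -> str:
    # Re-join runs of single letters separated by spaces: "H Ə F T Ə L İ K" -> "HƏFTƏLİK"
    return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)

def fix_encoding(s: str) -> str:
    # Undo UTF-8 bytes that were decoded as Latin-1, then let ftfy repair what is left
    if any(sym in s for sym in ("Ã", "Ä", "Å", "Ð", "Þ", "þ", "â")):
        try:
            s = s.encode("latin-1", "ignore").decode("utf-8", "ignore")
        except Exception:
            pass
    return fix_text(s)

print(fix_spacing(fix_encoding("H Ə F T Ə L İ K hesabat")))  # HƏFTƏLİK hesabat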
app/rag_system.py CHANGED
@@ -4,122 +4,118 @@ from __future__ import annotations
import os
import re
from pathlib import Path
- from typing import List, Tuple
+ from typing import List, Tuple, Optional

import faiss
import numpy as np
+ from ftfy import fix_text as _ftfy_fix

# Prefer pypdf; fallback to PyPDF2 if needed
try:
-     from pypdf import PdfReader
- except Exception:
-     from PyPDF2 import PdfReader  # type: ignore
+     from pypdf import PdfReader  # type: ignore
+ except Exception:  # pragma: no cover
+     try:
+         from PyPDF2 import PdfReader  # type: ignore
+     except Exception:  # pragma: no cover
+         PdfReader = None  # will try pdfminer if available

+ # sentence-transformers encoder
from sentence_transformers import SentenceTransformer
- from ftfy import fix_text
+

# ---------------- Paths & Cache (HF-safe) ----------------
- ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))
+ ROOT_DIR = Path(os.getenv("APP_ROOT", "/app"))  # HF Spaces writeable base
DATA_DIR = Path(os.getenv("DATA_DIR", str(ROOT_DIR / "data")))
UPLOAD_DIR = Path(os.getenv("UPLOAD_DIR", str(DATA_DIR / "uploads")))
INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_DIR / "index")))
- CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))
+ CACHE_DIR = Path(os.getenv("HF_HOME", str(ROOT_DIR / ".cache")))  # transformers uses HF_HOME

for d in (DATA_DIR, UPLOAD_DIR, INDEX_DIR, CACHE_DIR):
    d.mkdir(parents=True, exist_ok=True)

+
# ---------------- Config ----------------
MODEL_NAME = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
- OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").lower()
+ OUTPUT_LANG = os.getenv("OUTPUT_LANG", "en").strip().lower()  # "en" → translate AZ→EN

- # ---------------- Helpers ----------------
- AZ_CHARS = set("əğıöşçüİıĞÖŞÇÜƏ")
- NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|set|mt)\b", re.IGNORECASE)

- AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇç"
+ # ---------------- Text helpers ----------------
+ # Join AZ letters split by spaces (e.g., "H Ə F T Ə" → "HƏFTƏ")
+ AZ_LATIN = "A-Za-zƏəĞğİıÖöŞşÇçÜü"
_SINGLE_LETTER_RUN = re.compile(rf"\b(?:[{AZ_LATIN}]\s+){{2,}}[{AZ_LATIN}]\b")

def _fix_intra_word_spaces(s: str) -> str:
-     # "H Ə F T Ə" -> "HƏFTƏ"
    if not s:
        return s
    return _SINGLE_LETTER_RUN.sub(lambda m: re.sub(r"\s+", "", m.group(0)), s)

def _fix_mojibake(s: str) -> str:
-     # Try to undo latin-1/utf-8 mess, then ftfy as final pass
+     """Fix common UTF-8-as-Latin-1 mojibake quickly; then ftfy."""
    if not s:
        return s
-     try:
-         if any(ch in s for ch in ("Ã", "Ä", "Å", "Ð", "Þ", "þ")):
+     if any(sym in s for sym in ("Ã", "Ä", "Å", "Ð", "Þ", "þ", "â")):
+         try:
            s = s.encode("latin-1", "ignore").decode("utf-8", "ignore")
-     except Exception:
-         pass
-     s = fix_text(s)
-     s = _fix_intra_word_spaces(s)
-     return s
+         except Exception:
+             pass
+     # ftfy final pass (safe on already-correct text)
+     return _ftfy_fix(s)

- def _split_sentences(text: str) -> List[str]:
-     return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]
+ def _clean_for_summary(text: str) -> str:
+     """Remove ultra-short / numeric / tabular-ish lines, collapse spaces."""
+     NUM_TOKEN_RE = re.compile(r"\b(\d+[.,]?\d*|%|m²|azn|usd|eur|mt|m2)\b", re.IGNORECASE)

- def _mostly_numeric(s: str) -> bool:
-     alnum = [c for c in s if c.isalnum()]
-     if not alnum:
-         return True
-     digits = sum(c.isdigit() for c in alnum)
-     return digits / max(1, len(alnum)) > 0.3
+     def _mostly_numeric(s: str) -> bool:
+         alnum = [c for c in s if c.isalnum()]
+         if not alnum:
+             return True
+         digits = sum(c.isdigit() for c in alnum)
+         return digits / max(1, len(alnum)) > 0.30

- def _tabular_like(s: str) -> bool:
-     hits = len(NUM_TOKEN_RE.findall(s))
-     return hits >= 2 or "Page" in s or len(s) < 20
+     def _tabular_like(s: str) -> bool:
+         hits = len(NUM_TOKEN_RE.findall(s))
+         return hits >= 2 or "Page" in s or len(s) < 20

- def _clean_for_summary(text: str) -> str:
    out = []
    for ln in text.splitlines():
        t = " ".join(ln.split())
-         t = _fix_mojibake(t)
        if not t or _mostly_numeric(t) or _tabular_like(t):
            continue
        out.append(t)
    return " ".join(out)

- def _sim_jaccard(a: str, b: str) -> float:
-     aw = set(a.lower().split()); bw = set(b.lower().split())
-     if not aw or not bw:
-         return 0.0
-     return len(aw & bw) / len(aw | bw)
+ def _split_sentences(text: str) -> List[str]:
+     # simple splitter ok for extractive snippets
+     return [s.strip() for s in re.split(r"(?<=[\.!\?])\s+|[\r\n]+", text) if s.strip()]

STOPWORDS = {
    "the","a","an","and","or","of","to","in","on","for","with","by",
    "this","that","these","those","is","are","was","were","be","been","being",
    "at","as","it","its","from","into","about","over","after","before","than",
-     "such","can","could","should","would","may","might","will","shall"
+     "such","can","could","should","would","may","might","will","shall",
}
+
def _keywords(text: str) -> List[str]:
    toks = re.findall(r"[A-Za-zÀ-ÖØ-öø-ÿ0-9]+", text.lower())
    return [t for t in toks if t not in STOPWORDS and len(t) > 2]

- def _looks_azerbaijani(s: str) -> bool:
-     has_az = any(ch in AZ_CHARS for ch in s)
-     non_ascii_ratio = sum(ord(c) > 127 for c in s) / max(1, len(s))
-     return has_az or non_ascii_ratio > 0.15
-
- # ---- Descoped/out-of-scope heuristics ----
- DESCOPED_KWS = [
-     "descoped","out of scope","out-of-scope","exclude","excluded","exclusion",
-     "çıxarılan","çıxarıl","çıxarıldı","daxil deyil","sökül","demontaj","kəsilmə",
- ]
- def _descoped_mode(question: str) -> bool:
-     ql = (question or "").lower()
-     return any(k in ql for k in DESCOPED_KWS) or "descop" in ql
+ def _sim_jaccard(a: str, b: str) -> float:
+     aw = set(a.lower().split())
+     bw = set(b.lower().split())
+     if not aw or not bw:
+         return 0.0
+     return len(aw & bw) / len(aw | bw)

- def _is_descoped_line(s: str) -> bool:
-     sl = s.lower()
-     if any(k in sl for k in DESCOPED_KWS):
-         return True
-     return bool(re.search(r"\b(out[-\s]?of[-\s]?scope|descop)", sl))

# ---------------- RAG Core ----------------
class SimpleRAG:
+     """
+     Minimal RAG core:
+       - FAISS (IP) over sentence-transformers embeddings
+       - PDF → texts with robust decoding (pypdf/PyPDF2 + ftfy; optional pdfminer fallback)
+       - Extractive answer synthesis with embedding ranking + keyword fallback
+     """
+
    def __init__(
        self,
        index_path: Path = INDEX_DIR / "faiss.index",
@@ -138,7 +134,7 @@ class SimpleRAG:
        self.index: faiss.Index = faiss.IndexFlatIP(self.embed_dim)
        self.chunks: List[str] = []
        self.last_added: List[str] = []
-         self._translator = None  # lazy
+         self._translator = None  # lazy init

        self._load()

@@ -161,22 +157,67 @@ class SimpleRAG:
        faiss.write_index(self.index, str(self.index_path))
        np.save(self.meta_path, np.array(self.chunks, dtype=object))

-     # ---------- Utilities ----------
+     # ---------- Public utils ----------
    @property
    def is_empty(self) -> bool:
        return getattr(self.index, "ntotal", 0) == 0 or not self.chunks

+     @property
+     def faiss_ntotal(self) -> int:
+         return int(getattr(self.index, "ntotal", 0))
+
+     @property
+     def model_dim(self) -> int:
+         return int(self.embed_dim)
+
+     def reset_index(self) -> None:
+         self.index = faiss.IndexFlatIP(self.embed_dim)
+         self.chunks = []
+         self.last_added = []
+         try:
+             if self.index_path.exists():
+                 self.index_path.unlink()
+         except Exception:
+             pass
+         try:
+             if self.meta_path.exists():
+                 self.meta_path.unlink()
+         except Exception:
+             pass
+
+     # ---------- PDF → texts ----------
    @staticmethod
    def _pdf_to_texts(pdf_path: Path, step: int = 800) -> List[str]:
-         reader = PdfReader(str(pdf_path))
-         pages: List[str] = []
-         for p in reader.pages:
-             t = p.extract_text() or ""
-             t = _fix_mojibake(t)
-             if t.strip():
-                 pages.append(t)
+         texts: List[str] = []
+
+         # A) pypdf / PyPDF2
+         if PdfReader is not None:
+             try:
+                 reader = PdfReader(str(pdf_path))
+                 for p in getattr(reader, "pages", []):
+                     t = p.extract_text() or ""
+                     t = _fix_mojibake(t)
+                     t = _fix_intra_word_spaces(t)
+                     if t.strip():
+                         texts.append(t)
+             except Exception:
+                 pass
+
+         # B) Optional pdfminer fallback if nothing extracted
+         if not texts:
+             try:
+                 from pdfminer.high_level import extract_text  # type: ignore
+                 raw = extract_text(str(pdf_path)) or ""
+                 raw = _fix_mojibake(raw)
+                 raw = _fix_intra_word_spaces(raw)
+                 if raw.strip():
+                     texts = [raw]
+             except Exception:
+                 pass
+
+         # Split to fixed-size chunks (simple & fast)
        chunks: List[str] = []
-         for txt in pages:
+         for txt in texts:
            for i in range(0, len(txt), step):
                part = txt[i : i + step].strip()
                if part:
@@ -188,6 +229,9 @@ class SimpleRAG:
        texts = self._pdf_to_texts(pdf_path)
        if not texts:
            return 0
+         # final cleaning for safety
+         texts = [_fix_mojibake(_fix_intra_word_spaces(t)) for t in texts]
+
        emb = self.model.encode(
            texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False
        )
@@ -202,7 +246,7 @@ class SimpleRAG:
        if self.is_empty:
            return []
        q = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
-         k = max(1, min(int(k or 5), getattr(self.index, "ntotal", 1)))
+         k = max(1, min(int(k or 5), self.faiss_ntotal or 1))
        D, I = self.index.search(q, k)
        out: List[Tuple[str, float]] = []
        if I.size > 0 and self.chunks:
@@ -216,7 +260,7 @@ class SimpleRAG:
        if not texts:
            return texts
        try:
-             from transformers import pipeline
+             from transformers import pipeline  # lazy import
            if self._translator is None:
                self._translator = pipeline(
                    "translation",
@@ -225,35 +269,35 @@ class SimpleRAG:
                    device=-1,
                )
            outs = self._translator(texts, max_length=400)
-             return [fix_text(o["translation_text"].strip()) for o in outs]
+             return [o["translation_text"].strip() for o in outs]
        except Exception:
-             return texts
+             return texts  # graceful fallback

    # ---------- Fallbacks ----------
-     def _keyword_fallback(self, question: str, pool: List[str], limit_sentences: int = 4, allow_numeric: bool = False) -> List[str]:
+     def _keyword_fallback(self, question: str, pool: List[str], limit_sentences: int = 4) -> List[str]:
        qk = set(_keywords(question))
        if not qk:
            return []
        candidates: List[Tuple[float, str]] = []
-         for text in pool[:400]:
-             cleaned = _fix_mojibake(" ".join(text.split()))
+         for text in pool[:200]:
+             cleaned = _clean_for_summary(text)
            for s in _split_sentences(cleaned):
-                 if not allow_numeric:
-                     if _tabular_like(s) or _mostly_numeric(s):
-                         continue
+                 w = s.split()
+                 if not (8 <= len(w) <= 40):
+                     continue
                toks = set(_keywords(s))
                if not toks:
                    continue
                overlap = len(qk & toks)
-                 if overlap == 0 and not _is_descoped_line(s):
+                 if overlap == 0:
                    continue
-                 length_penalty = max(6, min(60, len(s.split())))
-                 score = overlap + (0.3 if _is_descoped_line(s) else 0.0) + min(0.5, overlap / length_penalty)
+                 length_penalty = max(8, min(40, len(w)))
+                 score = overlap + min(0.5, overlap / length_penalty)
                candidates.append((score, s))
        candidates.sort(key=lambda x: x[0], reverse=True)
+
        out: List[str] = []
        for _, s in candidates:
-             s = fix_text(s).strip()
            if any(_sim_jaccard(s, t) >= 0.82 for t in out):
                continue
            out.append(s)
@@ -266,24 +310,17 @@ class SimpleRAG:
        if not contexts and self.is_empty:
            return "No relevant context found. Index is empty — upload a PDF first."

-         desc_mode = _descoped_mode(question)
+         # Strong decoding & spacing fixes on contexts
+         contexts = [_fix_mojibake(_fix_intra_word_spaces(c)) for c in (contexts or [])]

-         # Build candidate sentences from nearby contexts
+         # Build candidate sentences from top contexts
        local_pool: List[str] = []
-         scan_n = 8 if desc_mode else 5
-         for c in (contexts or [])[:scan_n]:
-             cleaned = _fix_mojibake(" ".join(c.split()))
+         for c in (contexts or [])[:5]:
+             cleaned = _clean_for_summary(c)
            for s in _split_sentences(cleaned):
                w = s.split()
-                 if not ( (6 if desc_mode else 8) <= len(w) <= (60 if desc_mode else 35) ):
+                 if not (8 <= len(w) <= 40):
                    continue
-                 if not desc_mode:
-                     if _tabular_like(s) or _mostly_numeric(s):
-                         continue
-                 else:
-                     # allow numeric/tabular if it looks like descoped line
-                     if not _is_descoped_line(s) and (_tabular_like(s) or _mostly_numeric(s)):
-                         continue
                local_pool.append(" ".join(w))

        selected: List[str] = []
@@ -293,32 +330,34 @@ class SimpleRAG:
            scores = (cand_emb @ q_emb.T).ravel()
            order = np.argsort(-scores)
            for i in order:
-                 s = fix_text(local_pool[i]).strip()
+                 s = local_pool[i].strip()
                if any(_sim_jaccard(s, t) >= 0.82 for t in selected):
                    continue
                selected.append(s)
                if len(selected) >= max_sentences:
                    break

+         # Fallback via keywords over entire corpus
        if not selected:
-             selected = self._keyword_fallback(
-                 question,
-                 self.chunks,
-                 limit_sentences=max_sentences,
-                 allow_numeric=desc_mode,  # relax numeric filter for descoped Qs
-             )
+             selected = self._keyword_fallback(question, self.chunks, limit_sentences=max_sentences)

        if not selected:
            return "No readable sentences matched the question. Try a more specific query."

-         # Translate to EN if needed (and requested)
-         if OUTPUT_LANG == "en":
-             needs_tr = any(_looks_azerbaijani(s) for s in selected) or any(ch in "".join(selected) for ch in ("Ã","Ä","Þ"))
-             if needs_tr:
+         # Optional AZ→EN translate if output language is English and text is non-ASCII
+         if OUTPUT_LANG == "en" and any(ord(ch) > 127 for ch in " ".join(selected)):
+             try:
                selected = self._translate_to_en(selected)
+             except Exception:
+                 pass

        bullets = "\n".join(f"- {s}" for s in selected)
        return f"Answer (based on document context):\n{bullets}"


- __all__ = ["SimpleRAG", "UPLOAD_DIR", "INDEX_DIR"]
+ # Public API
+ __all__ = [
+     "SimpleRAG",
+     "UPLOAD_DIR",
+     "INDEX_DIR",
+ ]
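The reworked synthesis boils down to ranking candidate sentences against the question with normalized embeddings and skipping near-duplicates above a Jaccard cutoff of 0.82. A self-contained sketch of that ranking-plus-dedup pattern (illustrative only, not the module's public API; it assumes sentence-transformers can load the MiniLM model):

import numpy as np
from sentence_transformers import SentenceTransformer

def sim_jaccard(a: str, b: str) -> float:
    # Token-overlap similarity, mirroring the module-level _sim_jaccard helper
    aw, bw = set(a.lower().split()), set(b.lower().split())
    if not aw or not bw:
        return 0.0
    return len(aw & bw) / len(aw | bw)

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
question = "What work was completed this week?"
pool = [
    "Concrete works for the foundation were completed this week.",
    "The concrete works for the foundation were completed this week.",  # near-duplicate
    "Cable trays were installed on the second floor.",
]

# Rank candidates by inner product of normalized embeddings (cosine similarity)
q_emb = model.encode([question], convert_to_numpy=True, normalize_embeddings=True)
cand_emb = model.encode(pool, convert_to_numpy=True, normalize_embeddings=True)
order = np.argsort(-(cand_emb @ q_emb.T).ravel())

selected = []
for i in order:
    s = pool[i].strip()
    if any(sim_jaccard(s, t) >= 0.82 for t in selected):
        continue  # skip sentences that nearly repeat an already-selected one
    selected.append(s)

print(selected)  # two sentences; the near-duplicate is dropped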