Spaces:
Sleeping
Sleeping
Update src/rag_engine.py
Browse files — added a function that links the flattened context to the knowledge base
- src/rag_engine.py +46 -1
src/rag_engine.py
CHANGED
|
@@ -204,4 +204,49 @@ def reset_knowledge_base(username):
|
|
| 204 |
if os.path.exists(user_db_path):
|
| 205 |
shutil.rmtree(user_db_path)
|
| 206 |
return True, "Database Reset."
|
| 207 |
-
return False, "Database already empty."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
if os.path.exists(user_db_path):
|
| 205 |
shutil.rmtree(user_db_path)
|
| 206 |
return True, "Database Reset."
|
| 207 |
+
return False, "Database already empty."
|
| 208 |
+
|
| 209 |
+
def process_and_add_text(raw_text, source_name, username, strategy="paragraph"):
    """
    Directly indexes a raw text string into the user's vector DB.
    Useful for indexing content generated by the LLM (like flattened notes).

    Args:
        raw_text: The text to chunk and index.
        source_name: Source label stored in each chunk's metadata.
        username: Owner of the per-user Chroma directory under CHROMA_PATH.
        strategy: Chunking strategy — "paragraph", "token", or "page".

    Returns:
        A (success: bool, message: str) tuple; this function never raises —
        failures are reported through the message.
    """
    user_db_path = os.path.join(CHROMA_PATH, username)

    try:
        if not raw_text or not raw_text.strip():
            return False, "Content appears empty."

        # 1. CHUNK TEXT (Reusing the standard logic)
        # Lambdas defer construction so only the chosen splitter is built.
        splitter_factories = {
            "paragraph": lambda: RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100),
            "token": lambda: TokenTextSplitter(chunk_size=512, chunk_overlap=50),
            "page": lambda: RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200),
        }
        factory = splitter_factories.get(strategy)
        if factory is None:
            # Previously an unknown strategy silently produced zero chunks and
            # a misleading "No chunks created." — surface the bad argument.
            return False, f"Unknown chunking strategy: {strategy!r}."
        chunks = factory().split_text(raw_text)

        # 2. CREATE DOCUMENTS
        # The strategy metadata is tagged with "-flattened" so these chunks can
        # be distinguished from chunks indexed from the original document.
        docs = [
            Document(
                page_content=chunk,
                metadata={"source": source_name, "strategy": f"{strategy}-flattened"}
            )
            for chunk in chunks
        ]

        # 3. INDEX TO CHROMA
        if docs:
            emb_fn = get_embedding_func()
            db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
            db.add_documents(docs)
            return True, f"Successfully indexed {len(docs)} flattened chunks."
        else:
            return False, "No chunks created."

    except Exception as e:
        # Best-effort API surface: report the failure rather than propagate,
        # consistent with the (bool, message) contract used elsewhere.
        return False, f"Error processing text: {e}"