Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Upload folder using huggingface_hub
Browse files
config.py
CHANGED
|
@@ -3,15 +3,7 @@ from typing import List, Dict
|
|
| 3 |
|
| 4 |
|
| 5 |
class SanatanConfig:
|
| 6 |
-
# shuklaYajurVedamPdfPath: str = "./data/shukla-yajur-veda.pdf"
|
| 7 |
-
# shuklaYajurVedamSmallPdfPath: str = "./data/shukla-yajur-veda-small.pdf"
|
| 8 |
-
# vishnuPuranamPdfPath = "./data/vishnu_puranam.pdf"
|
| 9 |
-
# datastores = [{"name": "sanskrit_001", "dbStorePath": "./chromadb-store"}, {"name": "nalayiram", "dbStorePath": "./chromadb-store-4000"}]
|
| 10 |
dbStorePath: str = "./chromadb-store"
|
| 11 |
-
# shuklaYajurVedamCollectionName: str = "shukla_yajur_vedam"
|
| 12 |
-
# vishnuPuranamCollectionName: str = "vishnu_puranam"
|
| 13 |
-
# shuklaYajurVedamOutputDir = "./output/shukla_yajur_vedam"
|
| 14 |
-
# vishnuPuranamOutputDir = "./output/vishnu_puranam"
|
| 15 |
scriptures = [
|
| 16 |
{
|
| 17 |
"name": "vishnu_puranam",
|
|
@@ -203,6 +195,29 @@ class SanatanConfig:
|
|
| 203 |
"collection_name": "divya_prabandham",
|
| 204 |
"collection_embedding_fn": "openai",
|
| 205 |
"unit": "verse",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
"metadata_fields": [
|
| 207 |
{
|
| 208 |
"name": "prabandham_code",
|
|
@@ -246,7 +261,7 @@ class SanatanConfig:
|
|
| 246 |
{
|
| 247 |
"name": "verse",
|
| 248 |
"datatype": "int",
|
| 249 |
-
"is_unique"
|
| 250 |
"description": (
|
| 251 |
"Absolute verse number or pasuram number. Each verse has a unique number."
|
| 252 |
# "Use it only when a specific prabandham name is NOT mentioned in the user query."
|
|
@@ -574,3 +589,60 @@ class SanatanConfig:
|
|
| 574 |
for s in self.scriptures:
|
| 575 |
filtered.append({k: s[k] for k in fields_to_keep if k in s})
|
| 576 |
return filtered
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
class SanatanConfig:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
dbStorePath: str = "./chromadb-store"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
scriptures = [
|
| 8 |
{
|
| 9 |
"name": "vishnu_puranam",
|
|
|
|
| 195 |
"collection_name": "divya_prabandham",
|
| 196 |
"collection_embedding_fn": "openai",
|
| 197 |
"unit": "verse",
|
| 198 |
+
"field_mapping": {
|
| 199 |
+
"text": "pasuram_ta",
|
| 200 |
+
"title": lambda doc: f"{doc.get('prabandham_name','')} {doc.get('chapter','')}-{doc.get('decade','')}:{doc.get('position_in_chapter','')}",
|
| 201 |
+
"word_by_word_native": "wbw_ta",
|
| 202 |
+
"unit_index": "verse",
|
| 203 |
+
"transliteration": "pasuram_en",
|
| 204 |
+
"reference_link": "html_url",
|
| 205 |
+
"author": "azhwar_name",
|
| 206 |
+
"chapter_name": "prabandham_name",
|
| 207 |
+
"relative_path": lambda doc: "-".join(
|
| 208 |
+
filter(
|
| 209 |
+
None,
|
| 210 |
+
[
|
| 211 |
+
doc.get("prabandham_name", ""),
|
| 212 |
+
*(
|
| 213 |
+
str(doc.get(k))
|
| 214 |
+
for k in ["decade", "chapter", "position_in_chapter"]
|
| 215 |
+
if doc.get(k, -1) != -1
|
| 216 |
+
),
|
| 217 |
+
],
|
| 218 |
+
)
|
| 219 |
+
),
|
| 220 |
+
},
|
| 221 |
"metadata_fields": [
|
| 222 |
{
|
| 223 |
"name": "prabandham_code",
|
|
|
|
| 261 |
{
|
| 262 |
"name": "verse",
|
| 263 |
"datatype": "int",
|
| 264 |
+
"is_unique": True,
|
| 265 |
"description": (
|
| 266 |
"Absolute verse number or pasuram number. Each verse has a unique number."
|
| 267 |
# "Use it only when a specific prabandham name is NOT mentioned in the user query."
|
|
|
|
| 589 |
for s in self.scriptures:
|
| 590 |
filtered.append({k: s[k] for k in fields_to_keep if k in s})
|
| 591 |
return filtered
|
| 592 |
+
|
| 593 |
+
def canonicalize_document(
    self, scripture_name: str, document_text: str, metadata_doc: dict
):
    """
    Convert a scripture-specific document to a flattened canonical form.

    The scripture's ``field_mapping`` config maps canonical keys to either a
    metadata key (string) or a callable that receives ``metadata_doc``.
    Mapping keys outside the allowed canonical set are ignored.

    Args:
        scripture_name: ``name`` of an entry in ``self.scriptures``.
        document_text: Raw document text; stored under ``"document"`` and
            used as ``"text"`` when no usable text is mapped.
        metadata_doc: Metadata dict that the field mapping is resolved against.

    Returns:
        dict with the resolved canonical fields plus ``scripture_name``,
        ``scripture_title``, ``source``, ``language``, ``unit``, ``document``,
        ``text`` and ``verse``.

    Raises:
        ValueError: If no scripture named ``scripture_name`` is configured.
    """
    allowed_keys = {
        "verse",
        "text",
        "title",
        "unit",
        "unit_index",
        "word_by_word_native",
        "transliteration",
        "reference_link",
        "author",
        "chapter_name",
        "relative_path",
    }

    config = next((s for s in self.scriptures if s["name"] == scripture_name), None)
    if not config:
        raise ValueError(f"Unknown scripture: {scripture_name}")

    # Scriptures without a field mapping still get the global fields below.
    mapping = config.get("field_mapping") or {}

    def resolve_field(field):
        """Resolve a field: string key or lambda"""
        if callable(field):
            try:
                return field(metadata_doc)
            except Exception:
                # Best effort: a mapping callable that fails on incomplete
                # metadata yields None instead of failing the whole document.
                return None
        if isinstance(field, str):
            return metadata_doc.get(field)
        return None

    # Only include allowed canonical keys.
    canonical_doc = {
        key: resolve_field(field)
        for key, field in mapping.items()
        if key in allowed_keys
    }

    # Global fields from config; "unit" deliberately overrides any mapped value.
    canonical_doc["scripture_name"] = config.get("name")
    canonical_doc["scripture_title"] = config.get("title")
    canonical_doc["source"] = config.get("source")
    canonical_doc["language"] = config.get("language")
    canonical_doc["unit"] = config.get("unit")
    canonical_doc["document"] = document_text

    # "text" may be absent entirely (no mapping for it), None, or the "-"
    # placeholder; in all three cases fall back to the raw document text.
    text = canonical_doc.get("text")
    if text is None or text == "-":
        canonical_doc["text"] = document_text

    # The verse is always read from the metadata's "verse" key.
    canonical_doc["verse"] = resolve_field("verse")

    return canonical_doc
|
db.py
CHANGED
|
@@ -112,6 +112,50 @@ class SanatanDatabase:
|
|
| 112 |
n_results=n_results,
|
| 113 |
)
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
def search_semantic(
|
| 116 |
self,
|
| 117 |
collection_name: str,
|
|
|
|
| 112 |
n_results=n_results,
|
| 113 |
)
|
| 114 |
|
| 115 |
+
def fetch_document_by_index(self, collection_name: str, index: int, unit_name: str):
    """
    Fetch one document from a ChromaDB collection by its unit metadata value.

    The lookup is a metadata filter (``where={unit_name: index}``), not an
    offset into the collection: the returned document is one whose metadata
    field ``unit_name`` equals ``index``.

    Args:
        collection_name: Name of the ChromaDB collection.
        index: Value to match against the ``unit_name`` metadata field.
        unit_name: Metadata key identifying the unit (callers pass the
            scripture's configured ``unit``).

    Returns:
        dict: {
            "document": <document_text>,
            <metadata_key_1>: <value>,
            <metadata_key_2>: <value>,
            ...
        }
        Or a dict with a single "error" key if the lookup failed or matched
        nothing.
    """
    logger.info("Fetch document #%d from [%s]", index, collection_name)

    try:
        # Resolving the collection can fail too; keep it inside the try so the
        # caller always gets the documented error dict instead of an exception.
        collection = self.chroma_client.get_or_create_collection(name=collection_name)
        response = collection.get(
            limit=1,
            include=["metadatas", "documents"],
            where={unit_name: index},
        )
    except Exception as e:
        logger.error("Error fetching document: %s", e)
        return {"error": f"There was an error fetching the document: {str(e)}"}

    # Either list may be missing or None in the response; normalise to lists.
    documents = response.get("documents") or []
    metadatas = response.get("metadatas") or []

    if not documents:
        return {"error": "No data available."}

    # Merge document text with metadata. Metadata keys are applied last, as
    # before, so a metadata key named "document" would replace the text.
    result = {"document": documents[0]}
    if metadatas and metadatas[0]:
        result.update(metadatas[0])
    return result
|
| 157 |
+
|
| 158 |
+
|
| 159 |
def search_semantic(
|
| 160 |
self,
|
| 161 |
collection_name: str,
|
main.py
CHANGED
|
@@ -31,4 +31,4 @@ async def log_requests(request: Request, call_next):
|
|
| 31 |
return response
|
| 32 |
|
| 33 |
if __name__ == "__main__":
|
| 34 |
-
uvicorn.run("main:app", host="0.0.0.0", port=7860)
|
|
|
|
| 31 |
return response
|
| 32 |
|
| 33 |
if __name__ == "__main__":
|
| 34 |
+
uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)
|
server.py
CHANGED
|
@@ -180,10 +180,75 @@ async def handle_quiz_eval(payload: QuizEvalPayload, request: Request):
|
|
| 180 |
print(result.model_dump_json(indent=1))
|
| 181 |
return result
|
| 182 |
|
|
|
|
| 183 |
@router.get("/scriptures")
|
| 184 |
async def handle_get_scriptures():
|
| 185 |
return_values = {}
|
| 186 |
for scripture in SanatanConfig().scriptures:
|
| 187 |
-
if scripture[
|
| 188 |
-
return_values[scripture[
|
| 189 |
return return_values
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
print(result.model_dump_json(indent=1))
|
| 181 |
return result
|
| 182 |
|
| 183 |
+
|
| 184 |
@router.get("/scriptures")
async def handle_get_scriptures():
    """Return a ``{collection_name: title}`` map of the configured scriptures.

    The ``yt_metadata`` collection is left out of the result.
    """
    return {
        entry["collection_name"]: entry["title"]
        for entry in SanatanConfig().scriptures
        if entry["collection_name"] != "yt_metadata"
    }
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
class ScriptureRequest(BaseModel):
    # Request body accepted by the POST /scripture endpoint.

    # Compared against the "name" of each configured scripture.
    scripture_name: str

    # Passed to the database lookup as the unit value to fetch.
    unit_index: int
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
@router.post("/scripture")
async def get_scripture(req: ScriptureRequest):
    """
    Return a scripture unit (page or verse, based on config),
    including all metadata fields separately.

    Returns the canonical document extended with ``unit_index`` and ``total``,
    or a dict with a single ``"error"`` key when the scripture is unknown or
    the unit could not be fetched.
    """
    print("received request to fetch scripture.", req)

    sanatan_config = SanatanConfig()

    # find config entry for the scripture
    config = next(
        (s for s in sanatan_config.scriptures if s["name"] == req.scripture_name),
        None,
    )
    if not config:
        return {"error": f"Scripture '{req.scripture_name}' not found"}

    # fetch the raw document from DB
    db = SanatanDatabase()
    raw_doc = db.fetch_document_by_index(
        collection_name=config["collection_name"],
        index=req.unit_index,
        unit_name=config["unit"],
    )

    if not raw_doc or isinstance(raw_doc, str):
        return {"error": f"No data available for unit {req.unit_index}"}

    # The DB layer reports failures as {"error": ...}, which is truthy and not
    # a str, so the guard above misses it. Pass it on instead of canonicalizing
    # it as if it were a document.
    if "error" in raw_doc and "document" not in raw_doc:
        return {"error": raw_doc["error"]}

    # canonicalize it
    canonical_doc = sanatan_config.canonicalize_document(
        scripture_name=req.scripture_name,
        document_text=raw_doc.get("document", ""),
        metadata_doc=raw_doc,
    )

    # add unit index & total units (so Flutter can paginate)
    canonical_doc["unit_index"] = req.unit_index
    canonical_doc["total"] = db.count(config["collection_name"])

    print("canonical_doc = ", canonical_doc)
    return canonical_doc
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
@router.get("/scripture_configs")
async def get_scripture_configs():
    """List every configured scripture with its name, title, unit and unit count.

    The count comes from ``SanatanDatabase().count`` on the scripture's
    collection. The list is returned under the ``"scriptures"`` key.
    """

    def summarize(entry):
        # Count first, then read the descriptive fields, as one summary dict.
        total_units = SanatanDatabase().count(
            collection_name=entry["collection_name"]
        )
        return {
            "name": entry["name"],  # e.g. "bhagavad_gita"
            "title": entry["title"],  # e.g. "Bhagavad Gita"
            "unit": entry["unit"],  # e.g. "verse" or "page"
            "total": total_units,
        }

    return {"scriptures": [summarize(entry) for entry in SanatanConfig().scriptures]}
|