vikramvasudevan committed on
Commit
faece1b
·
verified ·
1 Parent(s): a00436d

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. config.py +81 -9
  2. db.py +44 -0
  3. main.py +1 -1
  4. server.py +67 -2
config.py CHANGED
@@ -3,15 +3,7 @@ from typing import List, Dict
3
 
4
 
5
  class SanatanConfig:
6
- # shuklaYajurVedamPdfPath: str = "./data/shukla-yajur-veda.pdf"
7
- # shuklaYajurVedamSmallPdfPath: str = "./data/shukla-yajur-veda-small.pdf"
8
- # vishnuPuranamPdfPath = "./data/vishnu_puranam.pdf"
9
- # datastores = [{"name": "sanskrit_001", "dbStorePath": "./chromadb-store"}, {"name": "nalayiram", "dbStorePath": "./chromadb-store-4000"}]
10
  dbStorePath: str = "./chromadb-store"
11
- # shuklaYajurVedamCollectionName: str = "shukla_yajur_vedam"
12
- # vishnuPuranamCollectionName: str = "vishnu_puranam"
13
- # shuklaYajurVedamOutputDir = "./output/shukla_yajur_vedam"
14
- # vishnuPuranamOutputDir = "./output/vishnu_puranam"
15
  scriptures = [
16
  {
17
  "name": "vishnu_puranam",
@@ -203,6 +195,29 @@ class SanatanConfig:
203
  "collection_name": "divya_prabandham",
204
  "collection_embedding_fn": "openai",
205
  "unit": "verse",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  "metadata_fields": [
207
  {
208
  "name": "prabandham_code",
@@ -246,7 +261,7 @@ class SanatanConfig:
246
  {
247
  "name": "verse",
248
  "datatype": "int",
249
- "is_unique" : True,
250
  "description": (
251
  "Absolute verse number or pasuram number. Each verse has a unique number."
252
  # "Use it only when a specific prabandham name is NOT mentioned in the user query."
@@ -574,3 +589,60 @@ class SanatanConfig:
574
  for s in self.scriptures:
575
  filtered.append({k: s[k] for k in fields_to_keep if k in s})
576
  return filtered
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  class SanatanConfig:
 
 
 
 
6
  dbStorePath: str = "./chromadb-store"
 
 
 
 
7
  scriptures = [
8
  {
9
  "name": "vishnu_puranam",
 
195
  "collection_name": "divya_prabandham",
196
  "collection_embedding_fn": "openai",
197
  "unit": "verse",
198
+ "field_mapping": {
199
+ "text": "pasuram_ta",
200
+ "title": lambda doc: f"{doc.get('prabandham_name','')} {doc.get('chapter','')}-{doc.get('decade','')}:{doc.get('position_in_chapter','')}",
201
+ "word_by_word_native": "wbw_ta",
202
+ "unit_index": "verse",
203
+ "transliteration": "pasuram_en",
204
+ "reference_link": "html_url",
205
+ "author": "azhwar_name",
206
+ "chapter_name": "prabandham_name",
207
+ "relative_path": lambda doc: "-".join(
208
+ filter(
209
+ None,
210
+ [
211
+ doc.get("prabandham_name", ""),
212
+ *(
213
+ str(doc.get(k))
214
+ for k in ["decade", "chapter", "position_in_chapter"]
215
+ if doc.get(k, -1) != -1
216
+ ),
217
+ ],
218
+ )
219
+ ),
220
+ },
221
  "metadata_fields": [
222
  {
223
  "name": "prabandham_code",
 
261
  {
262
  "name": "verse",
263
  "datatype": "int",
264
+ "is_unique": True,
265
  "description": (
266
  "Absolute verse number or pasuram number. Each verse has a unique number."
267
  # "Use it only when a specific prabandham name is NOT mentioned in the user query."
 
589
  for s in self.scriptures:
590
  filtered.append({k: s[k] for k in fields_to_keep if k in s})
591
  return filtered
592
+
593
def canonicalize_document(
    self, scripture_name: str, document_text: str, metadata_doc: dict
):
    """
    Convert a scripture-specific document to a flattened canonical form.

    The scripture's ``field_mapping`` config maps canonical keys to either a
    metadata key (string) or a callable that receives ``metadata_doc``.
    Mapping entries whose key is not an allowed canonical field are ignored.

    Args:
        scripture_name: Value of the ``"name"`` key of an entry in
            ``self.scriptures``.
        document_text: Raw document text; stored under ``"document"`` and
            used as ``"text"`` when no usable text is mapped.
        metadata_doc: Metadata dict the mapping is resolved against.

    Returns:
        dict: The mapped canonical fields plus ``scripture_name``,
        ``scripture_title``, ``source``, ``language``, ``unit``,
        ``document``, ``text`` and ``verse``.

    Raises:
        ValueError: If no scripture named ``scripture_name`` is configured.
    """
    allowed_keys = {
        "verse",
        "text",
        "title",
        "unit",
        "unit_index",
        "word_by_word_native",
        "transliteration",
        "reference_link",
        "author",
        "chapter_name",
        "relative_path",
    }

    config = next((s for s in self.scriptures if s["name"] == scripture_name), None)
    if not config:
        raise ValueError(f"Unknown scripture: {scripture_name}")

    mapping = config.get("field_mapping", {})

    def resolve_field(field):
        """Resolve a mapping value: metadata key (str) or callable."""
        if callable(field):
            try:
                return field(metadata_doc)
            except Exception:
                # Deliberate best-effort: a mapping lambda that fails on
                # incomplete metadata yields a missing field, not a crash.
                return None
        if isinstance(field, str):
            return metadata_doc.get(field)
        return None

    # only include allowed canonical keys
    canonical_doc = {
        key: resolve_field(field)
        for key, field in mapping.items()
        if key in allowed_keys
    }

    # global fields from config (these override any mapped value, e.g. "unit")
    canonical_doc["scripture_name"] = config.get("name")
    canonical_doc["scripture_title"] = config.get("title")
    canonical_doc["source"] = config.get("source")
    canonical_doc["language"] = config.get("language")
    canonical_doc["unit"] = config.get("unit")
    canonical_doc["document"] = document_text

    # Scriptures without a field_mapping (or without a "text" entry) have no
    # "text" key at this point, so read it with .get() instead of indexing.
    text = canonical_doc.get("text")
    if text is None or text == "-":
        canonical_doc["text"] = document_text

    canonical_doc["verse"] = resolve_field("verse")

    return canonical_doc
db.py CHANGED
@@ -112,6 +112,50 @@ class SanatanDatabase:
112
  n_results=n_results,
113
  )
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  def search_semantic(
116
  self,
117
  collection_name: str,
 
112
  n_results=n_results,
113
  )
114
 
115
def fetch_document_by_index(self, collection_name: str, index: int, unit_name: str):
    """
    Fetch a single document from a ChromaDB collection by a metadata value.

    The lookup is a metadata filter, not offset pagination: it returns the
    first document whose metadata field ``unit_name`` equals ``index``.

    Args:
        collection_name: Name of the ChromaDB collection.
        index: Value to match against the ``unit_name`` metadata field.
        unit_name: Name of the metadata field to filter on.

    Returns:
        dict: {
            "document": <document_text>,
            <metadata_key_1>: <value>,
            <metadata_key_2>: <value>,
            ...
        }
        Or a dict with "error" key if something went wrong or nothing matched.
    """
    logger.info("Fetch document #%d from [%s]", index, collection_name)

    try:
        # Collection lookup is inside the try so that a failure here is also
        # reported through the documented {"error": ...} shape.
        collection = self.chroma_client.get_or_create_collection(name=collection_name)
        response = collection.get(
            limit=1,
            include=["metadatas", "documents"],
            where={unit_name: index},
        )
    except Exception as e:
        logger.error("Error fetching document: %s", e)
        return {"error": f"There was an error fetching the document: {str(e)}"}

    # Either key may be absent or None; normalise both to an empty list.
    documents = response.get("documents") or []
    metadatas = response.get("metadatas") or []

    if not documents:
        return {"error": "No data available."}

    # merge document text with metadata
    result = {"document": documents[0]}
    # A document stored without metadata can yield a None entry here.
    if metadatas and metadatas[0]:
        result.update(metadatas[0])
    return result
+
158
+
159
  def search_semantic(
160
  self,
161
  collection_name: str,
main.py CHANGED
@@ -31,4 +31,4 @@ async def log_requests(request: Request, call_next):
31
  return response
32
 
33
  if __name__ == "__main__":
34
- uvicorn.run("main:app", host="0.0.0.0", port=7860)
 
31
  return response
32
 
33
  if __name__ == "__main__":
34
+ uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)
server.py CHANGED
@@ -180,10 +180,75 @@ async def handle_quiz_eval(payload: QuizEvalPayload, request: Request):
180
  print(result.model_dump_json(indent=1))
181
  return result
182
 
 
183
  @router.get("/scriptures")
184
  async def handle_get_scriptures():
185
  return_values = {}
186
  for scripture in SanatanConfig().scriptures:
187
- if scripture['collection_name'] != "yt_metadata":
188
- return_values[scripture['collection_name']] = scripture['title']
189
  return return_values
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  print(result.model_dump_json(indent=1))
181
  return result
182
 
183
+
184
@router.get("/scriptures")
async def handle_get_scriptures():
    """Return a ``{collection_name: title}`` map, omitting ``yt_metadata``."""
    return {
        entry["collection_name"]: entry["title"]
        for entry in SanatanConfig().scriptures
        if entry["collection_name"] != "yt_metadata"
    }
191
+
192
+
193
class ScriptureRequest(BaseModel):
    # Matched against the "name" key of a SanatanConfig scripture entry.
    scripture_name: str
    # Value looked up in the scripture's unit metadata field.
    unit_index: int


@router.post("/scripture")
async def get_scripture(req: ScriptureRequest):
    """
    Return a scripture unit (page or verse, based on config),
    including all metadata fields separately.

    Returns the canonical document dict with ``unit_index`` and ``total``
    added, or a dict with an ``"error"`` key when the scripture is unknown
    or no document could be fetched for the requested unit.
    """
    print("received request to fetch scripture.", req)

    # Build the config and database objects once and reuse them below.
    sanatan_config = SanatanConfig()
    database = SanatanDatabase()

    # find config entry for the scripture
    config = next(
        (s for s in sanatan_config.scriptures if s["name"] == req.scripture_name), None
    )
    if not config:
        return {"error": f"Scripture '{req.scripture_name}' not found"}

    # fetch the raw document from DB
    raw_doc = database.fetch_document_by_index(
        collection_name=config["collection_name"],
        index=req.unit_index,
        unit_name=config["unit"],
    )

    # The DB layer reports failures as {"error": ...}; treat that as "no data"
    # instead of canonicalizing the error dict as if it were a document.
    if not raw_doc or isinstance(raw_doc, str) or "error" in raw_doc:
        return {"error": f"No data available for unit {req.unit_index}"}

    # canonicalize it
    canonical_doc = sanatan_config.canonicalize_document(
        scripture_name=req.scripture_name,
        document_text=raw_doc.get("document", ""),
        metadata_doc=raw_doc,
    )

    # add unit index & total units (so Flutter can paginate)
    canonical_doc["unit_index"] = req.unit_index
    canonical_doc["total"] = database.count(config["collection_name"])

    print("canonical_doc = ", canonical_doc)
    return canonical_doc
236
+
237
+
238
@router.get("/scripture_configs")
async def get_scripture_configs():
    """
    List every configured scripture with its unit type and document count.

    Returns:
        dict: ``{"scriptures": [{"name", "title", "unit", "total"}, ...]}``
        where ``total`` is the count reported by the database for the
        scripture's collection.
    """
    # One database handle for all counts instead of a new one per scripture.
    database = SanatanDatabase()

    scriptures = []
    for s in SanatanConfig().scriptures:
        num_units = database.count(collection_name=s["collection_name"])
        scriptures.append(
            {
                "name": s["name"],  # e.g. "bhagavad_gita"
                "title": s["title"],  # e.g. "Bhagavad Gita"
                "unit": s["unit"],  # e.g. "verse" or "page"
                "total": num_units,
            }
        )
    return {"scriptures": scriptures}