Spaces: Running on CPU Upgrade
Upload folder using huggingface_hub
Browse files
- data/azhwars.json +5 -0
- db.py +17 -2
- modules/config/divya_prabandham_taniyans.py +16 -12
- nalayiram_helper.py +36 -2
- tests/test_divya_prabandham_verse_fix.py +9 -3
data/azhwars.json
CHANGED
|
@@ -128,5 +128,10 @@
|
|
| 128 |
"RNA",
|
| 129 |
"Thiruvarangathu Amutanar",
|
| 130 |
"Iramanusa Nootranthathi"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
]
|
| 132 |
]
|
|
|
|
| 128 |
"RNA",
|
| 129 |
"Thiruvarangathu Amutanar",
|
| 130 |
"Iramanusa Nootranthathi"
|
| 131 |
+
],
|
| 132 |
+
[
|
| 133 |
+
"taniyan",
|
| 134 |
+
"NA",
|
| 135 |
+
"Common"
|
| 136 |
]
|
| 137 |
]
|
db.py
CHANGED
|
@@ -208,7 +208,7 @@ class SanatanDatabase:
|
|
| 208 |
# If the conversion returns an empty dict, treat it as None
|
| 209 |
if isinstance(where_clause, dict) and not where_clause:
|
| 210 |
where_clause = None
|
| 211 |
-
|
| 212 |
# First, try strict filter
|
| 213 |
data = collection.get(include=["metadatas", "documents"], where=where_clause)
|
| 214 |
|
|
@@ -751,9 +751,19 @@ class SanatanDatabase:
|
|
| 751 |
df["_id"] = ids
|
| 752 |
df["_doc"] = documents
|
| 753 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 754 |
# Add sortable columns for each unique field
|
| 755 |
for field_name in unique_fields:
|
| 756 |
-
if field_name.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
# Map chapter names to their defined order
|
| 758 |
df["_sort_" + field_name] = (
|
| 759 |
df[field_name].map(chapter_order_mapping).fillna(np.inf)
|
|
@@ -773,6 +783,11 @@ class SanatanDatabase:
|
|
| 773 |
df["_sort_" + field_name] = df[field_name].apply(parse_val)
|
| 774 |
|
| 775 |
sort_cols = ["_sort_" + f for f in unique_fields]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 776 |
df = df.sort_values(by=sort_cols, kind="stable").reset_index(drop=True)
|
| 777 |
|
| 778 |
# Assign global index
|
|
|
|
| 208 |
# If the conversion returns an empty dict, treat it as None
|
| 209 |
if isinstance(where_clause, dict) and not where_clause:
|
| 210 |
where_clause = None
|
| 211 |
+
|
| 212 |
# First, try strict filter
|
| 213 |
data = collection.get(include=["metadatas", "documents"], where=where_clause)
|
| 214 |
|
|
|
|
| 751 |
df["_id"] = ids
|
| 752 |
df["_doc"] = documents
|
| 753 |
|
| 754 |
+
logger.info(
|
| 755 |
+
"build_global_index_for_all_scriptures:%s:unique_fields: %s",
|
| 756 |
+
scripture_name,
|
| 757 |
+
unique_fields,
|
| 758 |
+
)
|
| 759 |
+
|
| 760 |
# Add sortable columns for each unique field
|
| 761 |
for field_name in unique_fields:
|
| 762 |
+
if field_name.lower() in ("chapter","prabandham_name") and chapter_order_mapping:
|
| 763 |
+
logger.info(
|
| 764 |
+
"build_global_index_for_all_scriptures:%s:sorting",
|
| 765 |
+
scripture_name,
|
| 766 |
+
)
|
| 767 |
# Map chapter names to their defined order
|
| 768 |
df["_sort_" + field_name] = (
|
| 769 |
df[field_name].map(chapter_order_mapping).fillna(np.inf)
|
|
|
|
| 783 |
df["_sort_" + field_name] = df[field_name].apply(parse_val)
|
| 784 |
|
| 785 |
sort_cols = ["_sort_" + f for f in unique_fields]
|
| 786 |
+
logger.info(
|
| 787 |
+
"build_global_index_for_all_scriptures:%s:sort_cols=%s",
|
| 788 |
+
scripture_name,
|
| 789 |
+
sort_cols
|
| 790 |
+
)
|
| 791 |
df = df.sort_values(by=sort_cols, kind="stable").reset_index(drop=True)
|
| 792 |
|
| 793 |
# Assign global index
|
modules/config/divya_prabandham_taniyans.py
CHANGED
|
@@ -14,6 +14,7 @@ divya_prabandham_taniyans_config = {
|
|
| 14 |
"collection_embedding_fn": "openai",
|
| 15 |
"unit": "taniyan",
|
| 16 |
"unit_field": "verse",
|
|
|
|
| 17 |
"field_mapping": {
|
| 18 |
"text": "pasuram_ta",
|
| 19 |
"title": lambda doc: f"{doc.get('prabandham_name','')} Taniyan",
|
|
@@ -23,7 +24,6 @@ divya_prabandham_taniyans_config = {
|
|
| 23 |
"transliteration": "pasuram_en",
|
| 24 |
"reference_link": "html_url",
|
| 25 |
"author": "author",
|
| 26 |
-
# "chapter_name": "prabandham_name",
|
| 27 |
"relative_path": lambda doc: "-".join(
|
| 28 |
filter(
|
| 29 |
None,
|
|
@@ -32,13 +32,6 @@ divya_prabandham_taniyans_config = {
|
|
| 32 |
),
|
| 33 |
},
|
| 34 |
"metadata_fields": [
|
| 35 |
-
{
|
| 36 |
-
"name": "prabandham_code",
|
| 37 |
-
"label": "Prabandham Code",
|
| 38 |
-
"datatype": "str",
|
| 39 |
-
"description": "contains the short prabandham_code. e.g. `TPL` for `Thiruppallandu`",
|
| 40 |
-
"is_unique": True,
|
| 41 |
-
},
|
| 42 |
{
|
| 43 |
"name": "prabandham_name",
|
| 44 |
"label": "Prabandham Name",
|
|
@@ -46,10 +39,21 @@ divya_prabandham_taniyans_config = {
|
|
| 46 |
"description": "contains the prabandham name. e.g. `Thiruppallandu`",
|
| 47 |
"show_as_filter": True,
|
| 48 |
"component": "dropdown",
|
| 49 |
-
"lov": lambda:
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
"is_unique": True,
|
| 54 |
},
|
| 55 |
{
|
|
|
|
| 14 |
"collection_embedding_fn": "openai",
|
| 15 |
"unit": "taniyan",
|
| 16 |
"unit_field": "verse",
|
| 17 |
+
"chapter_order": lambda: nalayiram_helper.get_prabandham_chapter_order_mapping(),
|
| 18 |
"field_mapping": {
|
| 19 |
"text": "pasuram_ta",
|
| 20 |
"title": lambda doc: f"{doc.get('prabandham_name','')} Taniyan",
|
|
|
|
| 24 |
"transliteration": "pasuram_en",
|
| 25 |
"reference_link": "html_url",
|
| 26 |
"author": "author",
|
|
|
|
| 27 |
"relative_path": lambda doc: "-".join(
|
| 28 |
filter(
|
| 29 |
None,
|
|
|
|
| 32 |
),
|
| 33 |
},
|
| 34 |
"metadata_fields": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
{
|
| 36 |
"name": "prabandham_name",
|
| 37 |
"label": "Prabandham Name",
|
|
|
|
| 39 |
"description": "contains the prabandham name. e.g. `Thiruppallandu`",
|
| 40 |
"show_as_filter": True,
|
| 41 |
"component": "dropdown",
|
| 42 |
+
"lov": lambda: list(
|
| 43 |
+
set(
|
| 44 |
+
[
|
| 45 |
+
p.prabandham_name
|
| 46 |
+
for p in nalayiram_helper.get_standardized_prabandham_names()
|
| 47 |
+
]
|
| 48 |
+
)
|
| 49 |
+
),
|
| 50 |
+
"is_unique": True,
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"name": "prabandham_code",
|
| 54 |
+
"label": "Prabandham Code",
|
| 55 |
+
"datatype": "str",
|
| 56 |
+
"description": "contains the short prabandham_code. e.g. `TPL` for `Thiruppallandu`",
|
| 57 |
"is_unique": True,
|
| 58 |
},
|
| 59 |
{
|
nalayiram_helper.py
CHANGED
|
@@ -71,6 +71,7 @@ def get_standardized_divya_desam_names() -> list[str]:
|
|
| 71 |
]
|
| 72 |
return sorted(set([row["title"] for row in data]))
|
| 73 |
|
|
|
|
| 74 |
def reorder_taniyan(collection):
|
| 75 |
logger.info("reorder_taniyan: started")
|
| 76 |
|
|
@@ -160,7 +161,7 @@ def reorder_taniyan(collection):
|
|
| 160 |
|
| 161 |
|
| 162 |
def delete_taniyan(collection):
|
| 163 |
-
logger.info("delete_taniyan: started")
|
| 164 |
|
| 165 |
# Fetch all docs (only ids + metadata needed)
|
| 166 |
data = collection.get(include=["metadatas"])
|
|
@@ -169,7 +170,8 @@ def delete_taniyan(collection):
|
|
| 169 |
|
| 170 |
# Collect ids where section_type starts with "taniyan"
|
| 171 |
taniyan_ids = [
|
| 172 |
-
ids[i]
|
|
|
|
| 173 |
if meta.get("section_type", "").startswith("taniyan")
|
| 174 |
]
|
| 175 |
|
|
@@ -183,5 +185,37 @@ def delete_taniyan(collection):
|
|
| 183 |
logger.info("delete_taniyan: finished")
|
| 184 |
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
if __name__ == "__main__":
|
| 187 |
logger.info(get_standardized_azhwar_names())
|
|
|
|
| 71 |
]
|
| 72 |
return sorted(set([row["title"] for row in data]))
|
| 73 |
|
| 74 |
+
|
| 75 |
def reorder_taniyan(collection):
|
| 76 |
logger.info("reorder_taniyan: started")
|
| 77 |
|
|
|
|
| 161 |
|
| 162 |
|
| 163 |
def delete_taniyan(collection):
|
| 164 |
+
logger.info("delete_taniyan: started")
|
| 165 |
|
| 166 |
# Fetch all docs (only ids + metadata needed)
|
| 167 |
data = collection.get(include=["metadatas"])
|
|
|
|
| 170 |
|
| 171 |
# Collect ids where section_type starts with "taniyan"
|
| 172 |
taniyan_ids = [
|
| 173 |
+
ids[i]
|
| 174 |
+
for i, meta in enumerate(metas)
|
| 175 |
if meta.get("section_type", "").startswith("taniyan")
|
| 176 |
]
|
| 177 |
|
|
|
|
| 185 |
logger.info("delete_taniyan: finished")
|
| 186 |
|
| 187 |
|
| 188 |
+
def get_prabandham_chapter_order_mapping() -> dict[str, int]:
    """Return a mapping from prabandham name to its 1-based sort position.

    The names are listed in the order they should be sorted in, with
    ``"Common"`` first; each name maps to its list index plus one.

    In addition to the spellings listed below, every name that contains
    diacritics also gets a diacritic-free alias at the same position
    (``"Thiruppallandu"`` for ``"Thiruppallāṇḍu"``). The list mixes one
    diacritic spelling with otherwise plain-ASCII names, and a consumer that
    maps names through this dict and treats misses as "sort last" would
    silently misplace the other spelling.
    NOTE(review): confirm which spelling the stored metadata actually uses.

    Returns:
        dict[str, int]: prabandham name -> position, starting at 1.
    """
    # Local import so the helper does not depend on the module's import block.
    import unicodedata

    chapter_names = [
        "Common",
        "Thiruppallāṇḍu",
        "Periyazvar Thirumozhi",
        "Thiruppavai",
        "Nachiyar Thirumozhi",
        "Perumal Thirumozhi",
        "Thiruchandavirutham",
        "Thirumalai",
        "Thirupalliezhuchi",
        "Amalanadipiran",
        "Kanninunchiruthambu",
        "Periya Thirumozhi",
        "Thirukurunthandakam",
        "Thirunedumthandakam",
        "Muthal Thiruvanthathi",
        "Irandam Thiruvanthathi",
        "Moonram Thiruvanthathi",
        "Nanmukan Thiruvanthathi",
        "Thiruvirutham",
        "Thiruvasiriyam",
        "Periya Thiruvanthathi",
        "Thiruvezhukootrarikkai",
        "Siriya Thirumadal",
        "Periya Thirumadal",
        "Thiruvaimozhi",
        "Iramanusa Nootranthathi",
    ]
    section_dict = {
        name: position for position, name in enumerate(chapter_names, start=1)
    }

    # Register a diacritic-free alias for each name. For plain-ASCII names the
    # alias equals the name itself, so setdefault() leaves the entry untouched.
    for name, position in list(section_dict.items()):
        decomposed = unicodedata.normalize("NFKD", name)
        plain = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
        section_dict.setdefault(plain, position)

    return section_dict
|
| 219 |
+
|
| 220 |
if __name__ == "__main__":
|
| 221 |
logger.info(get_standardized_azhwar_names())
|
tests/test_divya_prabandham_verse_fix.py
CHANGED
|
@@ -1,13 +1,19 @@
|
|
| 1 |
import json
|
| 2 |
import logging
|
| 3 |
|
|
|
|
| 4 |
from db import SanatanDatabase
|
| 5 |
from metadata import MetadataFilter, MetadataWhereClause
|
| 6 |
|
| 7 |
|
| 8 |
if __name__ == "__main__":
|
| 9 |
logging.basicConfig()
|
| 10 |
-
collection_name = "
|
| 11 |
database = SanatanDatabase()
|
| 12 |
-
database.delete_taniyans_in_divya_prabandham()
|
| 13 |
-
database.fix_taniyans_in_divya_prabandham()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
import logging
|
| 3 |
|
| 4 |
+
from config import SanatanConfig
|
| 5 |
from db import SanatanDatabase
|
| 6 |
from metadata import MetadataFilter, MetadataWhereClause
|
| 7 |
|
| 8 |
|
| 9 |
if __name__ == "__main__":
    # Manual check script: rebuild the global index for the taniyans
    # collection and print the first two records returned for it.
    logging.basicConfig()
    collection_name = "divya_prabandham_taniyans"
    database = SanatanDatabase()

    # Left disabled in this revision; presumably one-off maintenance steps.
    # database.delete_taniyans_in_divya_prabandham()
    # database.fix_taniyans_in_divya_prabandham()

    config = SanatanConfig()
    # Reuse collection_name so the config lookup and the query below cannot
    # drift apart.
    scripture_config = config.get_scripture_by_name(collection_name)
    database.build_global_index_for_scripture(scripture_config)

    results = database.get(collection_name, None, n_results=2)
    print(results)
|