# veda_bot / utils.py
# Repository page header: samlonka, commit message 'utils', commit 21eb7b6
import os
import re
import requests
import json
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from database import execute_query
from aksharamukha import transliterate
from sentence_transformers import util
from llama_index.embeddings.nomic import NomicEmbedding
# API key for the Nomic embedding service, taken from the environment
# (None when NOMIC_API_KEY is not set).
nomic_api_key = os.environ.get('NOMIC_API_KEY')

# Module-wide Nomic embedding client; its embeddings are what the
# similarity scoring in this module compares.
nomic_embed_model = NomicEmbedding(
    model_name="nomic-embed-text-v1.5",
    dimensionality=128,
    api_key=nomic_api_key,
)
def get_list_meaning_word(word, timeout=10):
    """Scrape the dictionary meanings of a word from ambuda.org.

    Requests the combined Monier-Williams / Shabda-Sagara / Apte page for
    ``word``, extracts the text of each dictionary section and transliterates
    the scraped text from Devanagari to IAST.

    The lookup is best-effort: every dictionary section is parsed
    independently, and a section that cannot be parsed only prints an error
    and leaves its list empty. A failed HTTP request (including a timeout)
    prints an error and returns the dictionary with all lists empty.

    Args:
        word: The word to look up; it is inserted into the request URL as is.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        A dict with the key ``'pada'`` (the input word) and one list of
        strings per dictionary, keyed by the dictionary's full title.
    """
    mw_key = 'Monier-Williams Sanskrit-English Dictionary (1899)'
    shabda_key = 'Shabda-Sagara (1900)'
    apte_key = 'Apte-Practical Sanskrit-English Dictionary (1890)'
    pada_meanings = {
        'pada': word,
        mw_key: [],
        shabda_key: [],
        apte_key: [],
    }
    url = f"https://ambuda.org/tools/dictionaries/mw,shabdasagara,apte/{word}"
    try:
        # Fetch HTML content; a timeout surfaces as a RequestException below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        # One <div class="my-4" x-show="show"> per dictionary, indexed below
        # in the order mw, shabdasagara, apte.
        divs = soup.find_all('div', class_='my-4', attrs={'x-show': 'show'})

        # The handlers below catch Exception rather than using a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate while
        # any scraping or transliteration failure stays best-effort.
        try:
            # Monier-Williams: every <li class="dict-entry mw-entry"> in the <ul>.
            div_items_0 = divs[0].find('ul').find_all('li', class_='dict-entry mw-entry')
            dive_text_0 = [li_tag.get_text(strip=True) for li_tag in div_items_0]
            pada_meanings[mw_key] = [
                transliterate.process(src='Devanagari', tgt='IAST', txt=text)
                for text in dive_text_0
            ]
        except Exception:
            print("Error: Unable to find Monier-Williams Sanskrit-English Dictionary (1899) data.")

        try:
            # Shabda-Sagara: the text of every nested <div>.
            div_items_1 = divs[1].find_all('div')
            dive_text_1 = [item.get_text(strip=True) for item in div_items_1]
            pada_meanings[shabda_key] = [
                transliterate.process(src='Devanagari', tgt='IAST', txt=text)
                for text in dive_text_1
            ]
        except Exception:
            print("Error: Unable to find Shabda-Sagara (1900) data.")

        try:
            apte_meanings = []
            for tag in divs[2].find_all('b'):
                # Skip <b> tags that hold only an em dash. The literal used
                # to be a mis-encoded em dash that could never match.
                if tag.text.strip() == '\u2014':
                    continue
                sibling = tag.find_next_sibling()  # Next tag after the <b>
                text2 = tag.next_sibling.strip() + ' '  # Text node right after the <b>
                # Collect text from the following sibling tags until a <div>.
                while sibling.name != 'div':
                    if sibling.name is None:  # Handling non-tag text
                        text2 += " "
                    elif sibling.name == 'span':
                        # <span> content is transliterated Devanagari -> IAST.
                        IAST_text = transliterate.process(
                            src='Devanagari', tgt='IAST', txt=sibling.text.strip()
                        )
                        text2 += IAST_text + ' ' + sibling.next_sibling.strip()
                    else:
                        text2 += sibling.text.strip() + ' ' + sibling.next_sibling.strip()
                    sibling = sibling.find_next_sibling()
                apte_meanings.append(text2)
            # NOTE(review): the last collected entry is deliberately dropped,
            # presumably because it is not a real definition; confirm against
            # the page layout.
            pada_meanings[apte_key] = apte_meanings[:-1]
        except Exception:
            print("Error: Unable to find Apte-Practical Sanskrit-English Dictionary (1890) data.")
    except requests.exceptions.RequestException as e:
        print(f"Error: Failed to fetch data from {url}. {e}")
    return pada_meanings
#get similarity scores
def word_sentence_similarity(meanings, root_stem_word):
    """Rank the dictionary meanings of a word by similarity to a given text.

    Embeds ``meanings`` with the module's Nomic embedding model, looks up the
    Monier-Williams and Shabda-Sagara entries of ``root_stem_word`` and scores
    each entry against ``meanings`` by cosine similarity.

    Args:
        meanings: Text to compare the dictionary entries against.
        root_stem_word: Word whose dictionary entries are fetched and scored.

    Returns:
        ``None`` when either argument is empty/falsy. Otherwise a list of
        ``(dictionary_entry, similarity_score)`` tuples sorted by score in
        descending order (empty when no dictionary entries were found).
    """
    # Nothing to compare when either input is missing.
    if not meanings or not root_stem_word:
        return None

    meaning_embedding = np.array(nomic_embed_model.get_text_embedding(meanings))

    # Look the word up once and reuse the result for both dictionaries;
    # each lookup is a full HTTP scrape.
    pada_meanings = get_list_meaning_word(root_stem_word)
    all_meanings = [
        *pada_meanings['Monier-Williams Sanskrit-English Dictionary (1899)'],
        *pada_meanings['Shabda-Sagara (1900)'],
    ]

    word_score_pair = []
    for word_meaning in all_meanings:
        root_stem_word_meaning_embedding = np.array(
            nomic_embed_model.get_text_embedding(word_meaning)
        )
        # Calculate cosine similarity
        similarity_score = util.pytorch_cos_sim(
            meaning_embedding, root_stem_word_meaning_embedding
        ).item()
        word_score_pair.append((word_meaning, similarity_score))

    # Highest similarity first.
    return sorted(word_score_pair, key=lambda pair: pair[1], reverse=True)
# Extract the adhibhautic meaning of the mantra from the vedamantra data
def extract_meaning_by_language(data_list, target_language='English'):
    """Return the 'mahatma' value of the first entry in the requested language.

    Args:
        data_list: Iterable of dicts; each is checked on its 'languageName' key.
        target_language: Language name to match exactly (default 'English').

    Returns:
        The 'mahatma' value of the first matching entry, ``{}`` when that
        entry has no 'mahatma' key, or ``None`` when no entry matches.
    """
    matching_meanings = (
        entry.get('mahatma', {})
        for entry in data_list
        if entry.get('languageName') == target_language
    )
    # The generator is lazy, so the scan stops at the first match.
    return next(matching_meanings, None)
# Fetch the mantra_json details for a query
def get_details_mantra_json(query):
    """Run ``query`` and return the first row's ``mantra_json`` parsed as JSON.

    The result of ``execute_query`` is loaded into a DataFrame whose columns
    are named after the first item of each ``description`` entry. The
    ``mantra_json`` value of the first row has tag-like ``<...>`` fragments
    removed and is then decoded with ``json.loads``.

    Args:
        query: Query passed unchanged to ``execute_query``.

    Returns:
        The decoded JSON object.
    """
    description, rows = execute_query(query)

    frame = pd.DataFrame(rows)
    frame.columns = [column[0] for column in description]

    raw_json = frame['mantra_json'].values[0]
    # Remove markup-like "<...>" fragments before decoding.
    tag_pattern = re.compile('<[^<]+?>')
    without_tags = tag_pattern.sub('', raw_json)
    return json.loads(without_tags)
def iast_process(input_text):
    """Strip accent marks and joiners from a transliterated text.

    Removes the Devanagari stress/accent signs U+0951-U+0954, the zero width
    joiner U+200D and the combining macron below U+0331. All other
    characters, punctuation included, are returned unchanged.

    Args:
        input_text: Text to clean.

    Returns:
        ``input_text`` with the characters listed above removed.
    """
    # No commas inside the class: a comma there is a literal character, so
    # the previous pattern also deleted every "," from the text.
    output_text = re.sub('[\u0951-\u0954\u200d\u0331]', '', input_text)
    return output_text