"""Gradio app that searches the Tanach for phrases sharing a Gematria value.

On startup the app fills a SQLite database with every phrase of up to
``MAX_PHRASE_LENGTH`` consecutive words per verse, keyed by Gematria sum.
The web UI then looks up phrases whose sum equals that of the user's input.
"""

import html
import json
import logging
import re
import sqlite3
from collections import defaultdict
from typing import Dict, Iterator, List, Optional, Tuple
from urllib.parse import quote_plus

import gradio as gr
from deep_translator import GoogleTranslator, exceptions
from tqdm import tqdm  # Import tqdm for progress bars

from gematria import calculate_gematria
from util import process_json_files

# Constants
DATABASE_FILE = 'gematria.db'
MAX_PHRASE_LENGTH = 5  # Populate database for phrases up to 5 words
BATCH_SIZE = 1000  # Insert phrases into database in batches
TRANSLATION_ERROR = "[Translation Error]"  # Placeholder returned when translation fails

# Patterns used for every verse; compiled once instead of inside the loops.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_NON_HEBREW_RE = re.compile(r"[^\u05D0-\u05EA ]+")
_MULTISPACE_RE = re.compile(r" +")

# Set up logging
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s')

# Global variables
conn: Optional[sqlite3.Connection] = None
translator: Optional[GoogleTranslator] = None
book_names: Dict[int, str] = {}
gematria_cache: Dict[Tuple[int, int], List[Tuple[str, str, int, int]]] = {}
translation_cache: Dict[str, str] = {}


def initialize_database() -> None:
    """Open the SQLite database and create the tables if they are missing.

    Stores the connection in the module-level ``conn``.
    """
    global conn
    # isolation_level=None puts the connection in autocommit mode.
    conn = sqlite3.connect(DATABASE_FILE, isolation_level=None)
    cursor = conn.cursor()

    cursor.execute('''
        CREATE TABLE IF NOT EXISTS results (
            gematria_sum INTEGER,
            words TEXT,
            translation TEXT,
            book TEXT,
            chapter INTEGER,
            verse INTEGER,
            PRIMARY KEY (gematria_sum, words, book, chapter, verse)
        )
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS processed_books (
            book TEXT PRIMARY KEY,
            max_phrase_length INTEGER
        )
    ''')
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS translations (
            hebrew_phrase TEXT PRIMARY KEY,
            english_translation TEXT
        )
    ''')


def initialize_translator() -> None:
    """Create the Hebrew-to-English translator and store it in ``translator``."""
    global translator
    translator = GoogleTranslator(source='iw', target='en')
    logging.info("Translator initialized.")


def populate_database(start_book: int, end_book: int,
                      max_phrase_length: int = 1) -> Iterator[Tuple[int, str, str, int, int]]:
    """Yield every phrase of the requested books together with its Gematria sum.

    This is a generator: nothing runs until it is iterated, and the caller is
    responsible for inserting the yielded rows (see ``insert_phrases_to_db``).
    A book whose recorded ``max_phrase_length`` already covers the requested
    length is skipped. After all phrases of a book have been yielded, the
    book is recorded in ``processed_books``.

    Args:
        start_book: First book id (inclusive).
        end_book: Last book id (inclusive).
        max_phrase_length: Longest phrase, in words, to generate.

    Yields:
        ``(gematria_sum, phrase, book_title, chapter, verse)`` with 1-based
        chapter and verse numbers.
    """
    global conn, book_names
    logging.info(f"Populating database with books from {start_book} to {end_book}...")
    cursor = conn.cursor()

    for book_id in tqdm(range(start_book, end_book + 1), desc="Processing Books"):
        loaded = process_json_files(book_id, book_id)  # Get data for the single book

        # process_json_files returns a dictionary with book_id as key.
        if book_id not in loaded:
            continue
        book_data = loaded[book_id]

        if 'title' not in book_data or not isinstance(book_data['title'], str):
            logging.warning(f"Skipping book {book_id} due to missing or invalid 'title' field.")
            continue

        title = book_data['title']
        book_names[book_id] = title

        # Check if the book is already processed for this max_phrase_length
        cursor.execute('''SELECT max_phrase_length FROM processed_books WHERE book = ?''', (title,))
        result = cursor.fetchone()
        if result and result[0] >= max_phrase_length:
            logging.info(f"Skipping book {title}: Already processed with max_phrase_length {result[0]}")
            continue

        logging.info(f"Processing book {title} with max_phrase_length {max_phrase_length}")

        if 'text' not in book_data or not isinstance(book_data['text'], list):
            logging.warning(f"Skipping book {book_id} due to missing or invalid 'text' field.")
            continue

        for chapter_id, chapter in enumerate(book_data['text']):
            for verse_id, verse in enumerate(chapter):
                # Remove text in square brackets and non-Hebrew characters
                verse_text = flatten_text(verse)
                verse_text = _BRACKETED_RE.sub('', verse_text)
                verse_text = _NON_HEBREW_RE.sub('', verse_text)
                verse_text = _MULTISPACE_RE.sub(' ', verse_text)
                words = verse_text.split()

                # Sliding window over the verse for each phrase length.
                for length in range(1, max_phrase_length + 1):
                    for start in range(len(words) - length + 1):
                        phrase_candidate = " ".join(words[start:start + length])
                        gematria_sum = calculate_gematria(phrase_candidate.replace(" ", ""))
                        yield gematria_sum, phrase_candidate, title, chapter_id + 1, verse_id + 1

        # Mark the book as processed with the current max_phrase_length
        cursor.execute('''
            INSERT OR REPLACE INTO processed_books (book, max_phrase_length)
            VALUES (?, ?)
        ''', (title, max_phrase_length))


def insert_phrases_to_db(phrases: List[Tuple[int, str, str, int, int]]) -> None:
    """Insert a batch of ``(gematria_sum, words, book, chapter, verse)`` rows.

    Rows that already exist (same primary key) are ignored.
    """
    global conn
    cursor = conn.cursor()
    cursor.executemany('''
        INSERT OR IGNORE INTO results (gematria_sum, words, book, chapter, verse)
        VALUES (?, ?, ?, ?, ?)
    ''', phrases)
    conn.commit()


def get_translation(phrase: str) -> str:
    """Return the English translation of a Hebrew phrase.

    Lookup order: in-memory ``translation_cache``, then the ``translations``
    table, then the online translator. Successful translations are written
    to both the table and the in-memory cache. A failed translation returns
    the ``TRANSLATION_ERROR`` placeholder and is not stored, so a later call
    can retry.
    """
    global translator, conn, translation_cache

    if phrase in translation_cache:
        return translation_cache[phrase]

    cursor = conn.cursor()
    cursor.execute('''
        SELECT english_translation FROM translations
        WHERE hebrew_phrase = ?
    ''', (phrase,))
    row = cursor.fetchone()

    # A stored error placeholder (from an earlier failed attempt) counts as missing.
    if row and row[0] and row[0] != TRANSLATION_ERROR:
        translation = row[0]
    else:
        translation = translate_and_store(phrase)
        if not translation or translation == TRANSLATION_ERROR:
            return translation
        # REPLACE rather than IGNORE so an empty or placeholder row is overwritten.
        cursor.execute('''
            INSERT OR REPLACE INTO translations (hebrew_phrase, english_translation)
            VALUES (?, ?)
        ''', (phrase, translation))
        # The search handler opens its connection without autocommit, so the
        # insert would be lost on close() without an explicit commit.
        conn.commit()

    translation_cache[phrase] = translation
    return translation


def translate_and_store(phrase: str) -> str:
    """Translate a Hebrew phrase to English, retrying up to three times.

    Despite its name this function does not store anything; persistence is
    handled by ``get_translation``.

    Returns:
        The translation, or ``TRANSLATION_ERROR`` if every attempt failed.
    """
    global translator
    max_retries = 3

    for attempt in range(1, max_retries + 1):
        try:
            translation = translator.translate(phrase)
        # OSError covers the connection errors raised by the `requests`
        # library (its exceptions derive from IOError); the original named
        # `requests.exceptions.ConnectionError` without importing `requests`,
        # which raised NameError as soon as any translation failed.
        except (exceptions.TranslationNotFound, exceptions.NotValidPayload,
                exceptions.ServerException, exceptions.RequestError, OSError) as e:
            logging.warning(
                f"Error translating phrase '{phrase}': {e}. Retrying... ({attempt}/{max_retries})")
        else:
            logging.debug(f"Translated phrase: {translation}")
            return translation

    logging.error(f"Failed to translate phrase '{phrase}' after {max_retries} retries.")
    return TRANSLATION_ERROR


def search_gematria_in_db(gematria_sum: int, max_words: int) -> List[Tuple[str, str, int, int]]:
    """Return stored phrases with the given Gematria sum and at most ``max_words`` words.

    Returns:
        A list of ``(words, book, chapter, verse)`` tuples.
    """
    global conn
    cursor = conn.cursor()
    logging.debug(f"Searching for phrases with Gematria: {gematria_sum} and max words: {max_words}")
    cursor.execute('''
        SELECT words, book, chapter, verse FROM results WHERE gematria_sum = ?
    ''', (gematria_sum,))
    results = cursor.fetchall()
    logging.debug(f"Found {len(results)} matching phrases before filtering.")

    filtered_results = []
    for words, book, chapter, verse in results:
        word_count = len(words.split())
        logging.debug(f"Word count for '{words}': {word_count}")
        if word_count <= max_words:
            filtered_results.append((words, book, chapter, verse))

    logging.debug(f"Found {len(filtered_results)} matching phrases after filtering.")
    return filtered_results


def gematria_search_interface(phrase: str, max_words: int, show_translation: bool) -> str:
    """Gradio handler: find phrases whose Gematria equals that of the input.

    Digit runs in ``phrase`` are added to the sum as plain numbers; the
    remaining text is passed to ``calculate_gematria``.

    Args:
        phrase: Words and/or numbers entered by the user.
        max_words: Upper bound on the word count of returned phrases.
        show_translation: Whether to look up an English translation per hit.

    Returns:
        An HTML fragment with the hits grouped by book, or a short message
        when the input is empty or nothing matches.
    """
    if not phrase.strip():
        return "Please enter a phrase."

    global conn, book_names, gematria_cache
    # A fresh connection is opened per request and stored in the global that
    # the helper functions read.
    conn = sqlite3.connect(DATABASE_FILE)
    try:
        # Extract numbers from the input text
        numbers = re.findall(r'\d+', phrase)
        # Calculate Gematria for the remaining text (non-numbers)
        text_without_numbers = re.sub(r'\d+', '', phrase)
        phrase_gematria = calculate_gematria(text_without_numbers.replace(" ", ""))
        # Add sum of numbers to Gematria
        phrase_gematria += sum(int(number) for number in numbers)

        logging.info(f"Searching for phrases with Gematria: {phrase_gematria}")
        logging.debug(f"Phrase Gematria: {phrase_gematria}")
        logging.debug(f"Max Words: {max_words}")

        cache_key = (phrase_gematria, max_words)
        if cache_key in gematria_cache:
            matching_phrases = gematria_cache[cache_key]
            logging.debug(f"Retrieved matching phrases from cache for max_words: {max_words}.")
        else:
            matching_phrases = search_gematria_in_db(phrase_gematria, max_words)
            gematria_cache[cache_key] = matching_phrases
            logging.debug(f"Retrieved matching phrases from database for max_words: {max_words}.")

        if not matching_phrases:
            return "No matching phrases found."

        # Title -> book id, built once. setdefault keeps the first id for a
        # duplicated title, matching a first-match linear search.
        book_order: Dict[str, int] = {}
        for book_id, title in book_names.items():
            book_order.setdefault(title, int(book_id))

        # Sort by book, chapter, and verse; titles unknown to book_names
        # sort last instead of raising.
        sorted_phrases = sorted(
            matching_phrases,
            key=lambda hit: (book_order.get(hit[1], float("inf")), hit[2], hit[3]),
        )
        logging.debug(f"Sorted matching phrases: {sorted_phrases}")

        # Group results by book
        results_by_book = defaultdict(list)
        for words, book, chapter, verse in sorted_phrases:
            results_by_book[book].append((words, chapter, verse))
        logging.debug(f"Grouped results by book: {results_by_book}")

        # Format results for display. The return value is rendered by an
        # HTML component, so structure is expressed with tags and every
        # interpolated value is escaped.
        results = ["<div class='results-container'>"]
        for book, phrases in results_by_book.items():
            results.append(f"<h4>Book: {html.escape(book)}</h4>")
            for words, chapter, verse in phrases:
                translation = get_translation(words) if show_translation else ""
                link = (f"https://www.biblegateway.com/passage/"
                        f"?search={quote_plus(book)}+{chapter}%3A{verse}&version=CJB")
                results.append(f"""
<div class='result-item'>
<p>Chapter: {chapter}, Verse: {verse}</p>
<p>Hebrew Phrase: {html.escape(words)}</p>
<p>Translation: {html.escape(str(translation))}</p>
<a href='{html.escape(link)}' target='_blank' rel='noopener'>[See on Bible Gateway]</a>
</div>
""")
        results.append("</div>")  # Close results-container div

        # Placeholder for CSS styling; no rules are defined here.
        style = """ """
        return style + "\n".join(results)
    finally:
        # Close on every exit path, including early returns and errors.
        conn.close()


def flatten_text(text: List) -> str:
    """Join an arbitrarily nested list of strings into one space-separated string.

    A non-list argument is returned unchanged.
    """
    if not isinstance(text, list):
        return text
    return " ".join(flatten_text(item) for item in text)


def run_app() -> None:
    """Initialize the database and translator, populate the database, launch the UI."""
    initialize_database()
    initialize_translator()

    # Pre-populate the database
    logging.info("Starting database population...")
    phrases_to_insert = []  # Collect phrases before inserting in bulk

    # populate_database already yields every length from 1 up to the given
    # maximum, so one pass at MAX_PHRASE_LENGTH produces the full data set.
    max_phrase_length = MAX_PHRASE_LENGTH
    for row in tqdm(populate_database(1, 39, max_phrase_length=max_phrase_length),
                    desc=f"Populating Database (Max Length: {max_phrase_length})"):  # Books 1 to 39
        phrases_to_insert.append(row)
        if len(phrases_to_insert) >= BATCH_SIZE:  # Insert in batches of BATCH_SIZE for efficiency
            insert_phrases_to_db(phrases_to_insert)
            phrases_to_insert = []

    if phrases_to_insert:  # Insert remaining phrases
        insert_phrases_to_db(phrases_to_insert)

    logging.info("Database population complete.")

    iface = gr.Interface(
        fn=gematria_search_interface,
        inputs=[
            gr.Textbox(label="Enter word(s) or numbers (e.g., 'abc', '888' or 'abc 111 777')"),
            gr.Number(label="Max Word Count in Result Phrases", value=1, minimum=1, maximum=10),
            gr.Checkbox(label="Show Translation", value=True),
        ],
        outputs=gr.HTML(label="Results"),
        title="Gematria Search in Tanach",
        description="Search for phrases and/or numbers in the Tanach that have the same Gematria value.",
        live=False,
        allow_flagging="never",
    )
    iface.launch()


if __name__ == "__main__":
    run_app()