# neuralworm's picture
# set source languages of rigveda and tripitaka to sanskrit, set source language of torah to auto
# 8a905a4
import logging
logger = logging.getLogger(__name__)
import json
import os
import re
from deep_translator import GoogleTranslator
from gematria import calculate_gematria
import math
# Hebrew gematria values for the letters U+05D0-U+05EA.
# The five final forms (final kaf, mem, nun, pe, tsadi) carry the extended
# values 500-900; every other letter carries its standard value 1-400.
# The keys must be real Hebrew code points: the cleaning step elsewhere in
# this file keeps only U+05D0-U+05EA, so mis-encoded keys would never match.
gematria_values = {
    'א': 1, 'ב': 2, 'ג': 3, 'ד': 4, 'ה': 5, 'ו': 6, 'ז': 7, 'ח': 8, 'ט': 9,
    'י': 10, 'כ': 20, 'ך': 500, 'ל': 30, 'מ': 40, 'ם': 600, 'נ': 50, 'ן': 700,
    'ס': 60, 'ע': 70, 'פ': 80, 'ף': 800, 'צ': 90, 'ץ': 900, 'ק': 100,
    'ר': 200, 'ש': 300, 'ת': 400,
}
# Reverse dictionary for converting gematria values back to Hebrew characters.
# Every value above is unique, so the inversion loses no letter.
reverse_gematria_values = {v: k for k, v in gematria_values.items()}
def string_to_gematria(s):
    """Return the gematria value of every character in ``s`` as a list.

    The result has exactly one entry per input character. Characters that
    have no entry in ``gematria_values`` are represented by 0.
    """
    lookup = gematria_values.get
    return [lookup(letter, 0) for letter in s]
def gematria_to_string(value):
    """Spell ``value`` as Hebrew letters whose gematria values add up to it.

    The decomposition is greedy: letter values are tried from largest to
    smallest, and each one is emitted for as long as it still fits into the
    remainder. A value of 0 or below yields an empty string.
    """
    letters = []
    remainder = value
    # Sorting the (value, letter) pairs in reverse puts the largest value first.
    for amount, letter in sorted(reverse_gematria_values.items(), reverse=True):
        while remainder >= amount:
            letters.append(letter)
            remainder -= amount
    return ''.join(letters)
def average_gematria(str1, str2):
    """Average two strings position by position in gematria space.

    Both strings are converted to gematria values, the shorter list is
    padded with zeros, and each pair of values is averaged and rounded up
    with ``math.ceil``. Every rounded average is spelled back into Hebrew
    letters and the pieces are concatenated in order.
    """
    first = string_to_gematria(str1)
    second = string_to_gematria(str2)

    # Zero-pad so both lists can be walked with the same index.
    width = max(len(first), len(second))
    first += [0] * (width - len(first))
    second += [0] * (width - len(second))

    pieces = []
    for index in range(width):
        rounded_mean = math.ceil((first[index] + second[index]) / 2)
        pieces.append(gematria_to_string(rounded_mean))
    return ''.join(pieces)
def _clean_torah_text(text, strip_spaces, strip_in_braces, strip_diacritics):
    """Apply the optional bracket, non-letter and space filters to ``text``."""
    if strip_in_braces:
        # Non-greedy so each [...] span is removed separately; DOTALL lets a
        # span run across line breaks.
        text = re.sub(r"\[.*?\]", "", text, flags=re.DOTALL)
    if strip_diacritics:
        # Keep only the letters U+05D0-U+05EA and plain spaces.
        text = re.sub(r"[^\u05D0-\u05EA ]+", "", text)
    if strip_spaces:
        return text.replace(" ", "")
    # The removals above can leave several spaces in a row; collapse each run
    # to one space so they do not count as extra characters when stepping.
    return re.sub(r" {2,}", " ", text)


def _select_round_characters(text, step, round_num):
    """Pick every ``step``-th character of ``text`` for ``abs(round_num)`` passes.

    Returns None when nothing is to be selected: the text is empty, or a
    single pass (round 1 or -1) is requested with a step longer than the text.
    """
    text_length = len(text)
    if text_length == 0 or (abs(round_num) == 1 and step > text_length):
        return None

    forward = round_num > 0
    # Backward walks start ``step`` characters from the end, mirroring the
    # forward start at index ``step - 1``.
    position = step - 1 if forward else text_length - step
    stride = step if forward else -step

    picked = []
    completed_rounds = 0
    while completed_rounds < abs(round_num):
        picked.append(text[position % text_length])
        position += stride
        if forward:
            wrapped = position >= text_length * (completed_rounds + 1)
        else:
            # Mirror image of the forward test. Comparing against 0 alone
            # would count every step after the first wrap as a full round.
            wrapped = position < -text_length * completed_rounds
        if wrapped:
            completed_rounds += 1
    return ''.join(picked)


def process_json_files(start, end, step, rounds="1", length=0, tlang="en", strip_spaces=True, strip_in_braces=True, strip_diacritics=True, average_compile=False):
    """Select every ``step``-th letter from numbered Torah JSON files and translate it.

    For each number ``n`` in ``start..end`` the file ``texts/torah/NN.json``
    is loaded, its ``"text"`` blocks are joined and cleaned, and characters
    are picked for every requested round. The picked text is translated with
    ``GoogleTranslator`` and summed with ``calculate_gematria``.

    Args:
        start: Number of the first file (inclusive).
        end: Number of the last file (inclusive).
        step: Distance between picked characters; must be at least 1.
        rounds: Comma-separated integers. A positive number walks the text
            forwards for that many passes, wrapping around; a negative
            number walks it backwards from the end.
        length: When non-zero, the result text is cut to this many characters.
        tlang: Target language passed to ``GoogleTranslator``.
        strip_spaces: Remove all spaces before picking characters.
        strip_in_braces: Remove ``[...]`` spans, brackets included.
        strip_diacritics: Remove everything except U+05D0-U+05EA and spaces.
        average_compile: When true and more than one round produced text,
            combine the rounds with ``average_gematria`` instead of
            concatenating them.

    Returns:
        A list of dicts. A file that yields text contributes a dict with the
        keys ``book``, ``title``, ``result_text``, ``result_sum`` and
        ``translated_text``. A file that is missing, is not valid JSON, or
        lacks an expected key contributes ``{"error": ...}``. Files that
        yield no text contribute nothing.

    Raises:
        ValueError: If ``step`` is smaller than 1 (the walk would never
            finish) or ``rounds`` contains something that is not an integer.
    """
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step!r}")
    # Parsed once up front; it does not depend on the file being processed.
    round_numbers = [int(part) for part in rounds.split(',')]

    base_path = "texts/torah"
    translator = GoogleTranslator(source='auto', target=tlang)
    results = []

    # The file number gets its own name that no inner loop reuses, because it
    # is needed again for the "book" label after the averaging loop has run.
    for file_number in range(start, end + 1):
        file_name = f"{base_path}/{file_number:02}.json"
        try:
            with open(file_name, 'r', encoding='utf-8') as file:
                data = json.load(file)

            # Words inside a block are space-separated; blocks themselves are
            # concatenated without a separator.
            full_text = ''.join(' '.join(block) for block in data["text"])
            clean_text = _clean_torah_text(full_text, strip_spaces, strip_in_braces, strip_diacritics)

            selected_characters_per_round = {}
            for round_num in round_numbers:
                picked = _select_round_characters(clean_text, step, round_num)
                if picked is not None:
                    selected_characters_per_round[round_num] = picked

            if average_compile and len(selected_characters_per_round) > 1:
                result_text = ""
                keys = sorted(selected_characters_per_round)
                # NOTE(review): each pair overwrites the previous result, so
                # only the two largest round numbers end up in the output.
                # Confirm whether a running average over all rounds was meant.
                for lower, upper in zip(keys, keys[1:]):
                    result_text = average_gematria(
                        selected_characters_per_round[lower],
                        selected_characters_per_round[upper],
                    )
            else:
                result_text = ''.join(selected_characters_per_round.values())

            if length != 0:
                result_text = result_text[:length]

            # Nothing selected: skip the translation call and the result entry.
            if not result_text:
                continue

            results.append({
                "book": f"Torah {file_number}.",
                "title": data["title"],
                "result_text": result_text,
                "result_sum": calculate_gematria(result_text),
                "translated_text": translator.translate(result_text),
            })
        except FileNotFoundError:
            results.append({"error": f"File {file_name} not found."})
        except json.JSONDecodeError as e:
            results.append({"error": f"File {file_name} could not be read as JSON: {e}"})
        except KeyError as e:
            results.append({"error": f"Expected key 'text' is missing in {file_name}: {e}"})
    return results
# Tests
# Each entry pairs the return value of a process_json_files call with the
# result_text expected from its first result; None means "no results".
# Every case is currently commented out, so the list is empty and the loop
# below has nothing to check.
test_results = [
    # (process_json_files(0, 0, 21, rounds="3", length=0), "שרק"),
    # (process_json_files(0, 0, 22, rounds="1", length=0), "ת"),
    # (process_json_files(0, 0, 22, rounds="3", length=0), "תתת"),
    # (process_json_files(0, 0, 23, rounds="3", length=0), "אבג"),
    # (process_json_files(0, 0, 11, rounds="1", length=0), "כת"),
    # (process_json_files(0, 0, 2, rounds="1", length=0), "בדוחילנעצרת"),
    # (process_json_files(0, 0, 23, rounds="1", length=0), None),  # Expect None, when no results
    # (process_json_files(0, 0, 23, rounds="-1", length=0), None),  # Expect None, when no results
    # (process_json_files(0, 0, 22, rounds="-1", length=0), "א"),
    # (process_json_files(0, 0, 22, rounds="-2", length=0), "אא"),
    # (process_json_files(0, 0, 1, rounds="-1", length=0), "תשרקצפעסנמלכיטחזוהדגבא"),  # Reversed Hebrew alphabet
    # (process_json_files(0, 0, 1, rounds="1,-1", length=0), "אבגדהוזחטיכלמנסעפצקרשתתשרקצפעסנמלכיטחזוהדגבא"),  # Combined rounds
    # (process_json_files(0, 0, 22, rounds="1,-1", length=0, average_compile=True), "רא"),  # average compile test: (400+1) / 2 -> math.ceil(200.5) = 201 = 200+1 = "רא"
]

all_tests_passed = True
for result, expected in test_results:
    if expected is None:
        # A None expectation passes only when the call produced nothing.
        if result:
            logger.error(f"Test failed: Expected no results, but got: {result}")
            all_tests_passed = False
        else:
            logger.info("Test passed: Expected no results, got no results.")
        continue

    if not result:
        # Guard before indexing: an empty result list has no first entry.
        logger.error(f"Test failed: Expected '{expected}', but got no results")
        all_tests_passed = False
        continue

    result_text = result[0]['result_text']
    if result_text != expected:
        logger.error(f"Test failed: Expected '{expected}', but got '{result_text}'")
        all_tests_passed = False
    else:
        logger.info(f"Test passed: Expected '{expected}', got '{result_text}'")

if all_tests_passed:
    logger.info("All round tests passed.")