pythonlearnreal's picture
Upload folder using huggingface_hub
106478e verified
# Thai words for the digits 0-9, indexed by digit value.
_THAI_DIGITS = (
    "ศูนย์", "หนึ่ง", "สอง", "สาม", "สี่",
    "ห้า", "หก", "เจ็ด", "แปด", "เก้า",
)
# Thai place-value words for units through hundred-thousands, indexed by
# the digit's position counted from the right.
_THAI_PLACES = ("", "สิบ", "ร้อย", "พัน", "หมื่น", "แสน")


def number_to_thai_text(num, digit_by_digit=False):
    """Spell a non-negative integer in Thai words.

    Args:
        num: Non-negative integer to convert.
        digit_by_digit: When true, read every decimal digit of ``num``
            on its own, separated by single spaces, instead of reading
            the value as a cardinal number.

    Returns:
        The Thai reading. Cardinal readings contain no spaces; values of
        one million and above are read as "<millions>ล้าน<remainder>",
        applied recursively. Zero is read as "ศูนย์" in either mode.

    Raises:
        ValueError: If ``num`` is negative.
    """
    if num == 0:
        return _THAI_DIGITS[0]
    if num < 0:
        # Previously this surfaced as an opaque int('-') failure.
        raise ValueError(f"number_to_thai_text expects a non-negative integer, got {num!r}")

    if digit_by_digit:
        return " ".join(_THAI_DIGITS[int(d)] for d in str(num))

    # Read everything above the lowest six digits first, then the rest.
    if num >= 1000000:
        millions, remainder = divmod(num, 1000000)
        text = number_to_thai_text(millions) + "ล้าน"
        if remainder > 0:
            text += number_to_thai_text(remainder)
        return text

    # Units first, so the list index equals the place-value index.
    digits = [int(d) for d in reversed(str(num))]
    has_tens = len(digits) > 1 and digits[1] != 0

    parts = []
    for place, digit in enumerate(digits):
        if digit == 0:
            continue  # zeros are silent inside a cardinal reading
        if place == 0:
            # A trailing 1 is read "เอ็ด" after ANY non-zero tens digit
            # (11, 21, 31, ... 91), not only after 1 or 2.
            # NOTE(review): with a zero tens digit (e.g. 101) the plain
            # "หนึ่ง" reading is kept, as in the previous implementation.
            if digit == 1 and has_tens:
                parts.append("เอ็ด")
            else:
                parts.append(_THAI_DIGITS[digit])
        elif place == 1 and digit == 1:
            parts.append(_THAI_PLACES[1])  # 10-19: "สิบ", not "หนึ่งสิบ"
        elif place == 1 and digit == 2:
            parts.append("ยี่" + _THAI_PLACES[1])  # 20-29: "ยี่สิบ"
        else:
            parts.append(_THAI_DIGITS[digit] + _THAI_PLACES[place])

    # Parts were collected units-first; speak them highest place first.
    return "".join(reversed(parts))
def replace_numbers_with_thai(text):
    """Replace the numbers in *text* with their Thai spoken form.

    The text is split on whitespace and each token is handled as follows:

    * A token made only of digits and commas, optionally followed by
      ``.`` and more digits, is read as a number. Commas are dropped.
      An integer part longer than 7 digits is read digit by digit (with
      leading zeros kept); otherwise it is read as a cardinal number.
      Decimal digits are always read one by one after "จุด".
    * Any other token containing digits has each run of digits read
      digit by digit, while runs of non-digit characters are kept intact.
      Runs are separated by single spaces.
    * Tokens without digits are passed through unchanged.

    Tokens are re-joined with single spaces, so runs of whitespace
    (including newlines) in the input collapse to one space.

    Args:
        text: Input string.

    Returns:
        The text with numbers spelled out in Thai.
    """
    import re

    number_token = re.compile(r'[\d,]+(\.\d+)?')
    any_digit = re.compile(r'\d')
    # Alternating runs of digits (group 1) and non-digits.
    digit_or_text_run = re.compile(r'(\d+)|\D+')

    def spell_digits(digits):
        # Work from the string, not int(digits), so leading zeros are
        # spoken (e.g. phone numbers starting with 0).
        return " ".join(number_to_thai_text(int(d)) for d in digits)

    def read_integer(digits):
        # Long digit strings are treated as codes, not quantities.
        if len(digits) > 7:
            return spell_digits(digits)
        return number_to_thai_text(int(digits))

    def convert_number(token):
        cleaned = token.replace(',', '')
        # A token made only of separators carries no number to read.
        if not cleaned or cleaned == '.':
            return token
        integer_part, _, decimal_part = cleaned.partition('.')
        # An empty integer part (e.g. ",.5") is read as zero.
        spoken = read_integer(integer_part) if integer_part else number_to_thai_text(0)
        if decimal_part:
            spoken += "จุด " + spell_digits(decimal_part)
        return spoken

    def convert_mixed(token):
        pieces = []
        for run in digit_or_text_run.finditer(token):
            digits = run.group(1)
            # Keep non-digit runs whole so Thai combining vowels and
            # tone marks stay attached to their base consonant.
            pieces.append(spell_digits(digits) if digits else run.group(0))
        return " ".join(pieces)

    converted = []
    for token in text.split():
        if number_token.fullmatch(token):
            converted.append(convert_number(token))
        elif any_digit.search(token):
            # `\d` matches only Unicode decimal digits, all of which int()
            # accepts; str.isdigit() also accepts e.g. "²", which int()
            # rejects with ValueError.
            converted.append(convert_mixed(token))
        else:
            converted.append(token)
    return " ".join(converted)
# Demo run: print sample conversions when the file is executed as a script
if __name__ == "__main__":
    # Cardinal readings for a handful of integers
    for sample in (1, 12, 500, 6450, 100000, 12345678):
        print(f"{sample:,} -> {number_to_thai_text(sample)}")

    # Sentences mixing Thai text with integers, decimals and digit groups
    sample_sentences = (
        "ฉันมีเงิน 500 บาท",
        "ราคา 123.45 บาท",
        "บ้านเลขที่ 12 34",
        "วันที่ 15 08 2023",
    )
    for sentence in sample_sentences:
        converted = replace_numbers_with_thai(sentence)
        print(f"\nOriginal: {sentence}")
        print(f"Converted: {converted}")