Spaces:
Sleeping
Sleeping
case insensitive wiktionary search
Browse files
app.py
CHANGED
|
@@ -1410,26 +1410,35 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
|
|
| 1410 |
return report
|
| 1411 |
|
| 1412 |
def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
|
| 1413 |
-
""" Finds all entries related to an English word. """
|
| 1414 |
log(f"Wiktionary (EN): Querying for '{word}'...")
|
| 1415 |
found_entry_ids: Set[int] = set()
|
| 1416 |
|
| 1417 |
lang_query = 'English'
|
| 1418 |
form_titles = ("Inflected form", "verb form", "noun form", "adjective form", "Comparative", "Superlative")
|
| 1419 |
|
| 1420 |
-
|
| 1421 |
-
|
| 1422 |
-
).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1423 |
|
| 1424 |
parent_lemmas_to_find: Set[str] = set()
|
| 1425 |
|
| 1426 |
for row in lemma_q:
|
| 1427 |
entry_id = row["id"]
|
| 1428 |
-
pos_title = row["pos_title"]
|
| 1429 |
found_entry_ids.add(entry_id)
|
| 1430 |
|
| 1431 |
-
|
| 1432 |
-
|
|
|
|
| 1433 |
form_of_q = conn.execute(
|
| 1434 |
"SELECT form_of FROM senses WHERE entry_id = ?", (entry_id,)
|
| 1435 |
).fetchall()
|
|
@@ -1444,30 +1453,35 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
|
|
| 1444 |
if parent_lemma_word:
|
| 1445 |
parent_lemmas_to_find.add(parent_lemma_word)
|
| 1446 |
except json.JSONDecodeError:
|
| 1447 |
-
|
| 1448 |
|
| 1449 |
-
|
| 1450 |
-
|
|
|
|
| 1451 |
SELECT DISTINCT e.id
|
| 1452 |
FROM forms f
|
| 1453 |
JOIN entries e ON f.entry_id = e.id
|
| 1454 |
-
WHERE f.form_text
|
| 1455 |
AND f.id NOT IN (
|
| 1456 |
SELECT ft.form_id
|
| 1457 |
FROM form_tags ft
|
| 1458 |
JOIN tags t ON ft.tag_id = t.id
|
| 1459 |
WHERE t.tag IN ('variant', 'auxiliary')
|
| 1460 |
)
|
| 1461 |
-
|
| 1462 |
-
|
|
|
|
|
|
|
| 1463 |
for row in form_q:
|
| 1464 |
found_entry_ids.add(row["id"])
|
| 1465 |
|
|
|
|
| 1466 |
if parent_lemmas_to_find:
|
| 1467 |
log(f"Wiktionary: Found parent lemmas to add: {parent_lemmas_to_find}")
|
| 1468 |
for lemma_word in parent_lemmas_to_find:
|
|
|
|
| 1469 |
parent_id_q = conn.execute(
|
| 1470 |
-
|
| 1471 |
).fetchall()
|
| 1472 |
for row in parent_id_q:
|
| 1473 |
found_entry_ids.add(row["id"])
|
|
|
|
| 1410 |
return report
|
| 1411 |
|
| 1412 |
def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
|
| 1413 |
+
""" Finds all entries related to an English word, trying its original, lowercase, and title-case spellings. """
|
| 1414 |
log(f"Wiktionary (EN): Querying for '{word}'...")
|
| 1415 |
found_entry_ids: Set[int] = set()
|
| 1416 |
|
| 1417 |
lang_query = 'English'
|
| 1418 |
form_titles = ("Inflected form", "verb form", "noun form", "adjective form", "Comparative", "Superlative")
|
| 1419 |
|
| 1420 |
+
# --- FIX: Search for Original, Lowercase, and Title Case ---
|
| 1421 |
+
# This ensures we find "Ready" when input is "ready"
|
| 1422 |
+
search_variants = list(set([word, word.lower(), word.title()]))
|
| 1423 |
+
placeholders = ', '.join('?' for _ in search_variants)
|
| 1424 |
+
|
| 1425 |
+
# 1. Search Lemmatized Entries
|
| 1426 |
+
sql_lemma = f"SELECT id, pos_title, word FROM entries WHERE word IN ({placeholders}) AND lang = ?"
|
| 1427 |
+
# flatten params: [var1, var2, ..., 'English']
|
| 1428 |
+
params_lemma = list(search_variants) + [lang_query]
|
| 1429 |
+
|
| 1430 |
+
lemma_q = conn.execute(sql_lemma, params_lemma).fetchall()
|
| 1431 |
|
| 1432 |
parent_lemmas_to_find: Set[str] = set()
|
| 1433 |
|
| 1434 |
for row in lemma_q:
|
| 1435 |
entry_id = row["id"]
|
| 1436 |
+
pos_title = row["pos_title"] or "" # Safe string
|
| 1437 |
found_entry_ids.add(entry_id)
|
| 1438 |
|
| 1439 |
+
# If it's a form entry (e.g. "running"), try to find the parent ("run")
|
| 1440 |
+
if any(ft in pos_title for ft in form_titles):
|
| 1441 |
+
log(f"Wiktionary: '{row['word']}' is an inflected entry (ID {entry_id}). Looking for parent...")
|
| 1442 |
form_of_q = conn.execute(
|
| 1443 |
"SELECT form_of FROM senses WHERE entry_id = ?", (entry_id,)
|
| 1444 |
).fetchall()
|
|
|
|
| 1453 |
if parent_lemma_word:
|
| 1454 |
parent_lemmas_to_find.add(parent_lemma_word)
|
| 1455 |
except json.JSONDecodeError:
|
| 1456 |
+
pass
|
| 1457 |
|
| 1458 |
+
# 2. Search Inflected Forms (Reverse lookup)
|
| 1459 |
+
# We also apply the case variants here
|
| 1460 |
+
sql_form = f"""
|
| 1461 |
SELECT DISTINCT e.id
|
| 1462 |
FROM forms f
|
| 1463 |
JOIN entries e ON f.entry_id = e.id
|
| 1464 |
+
WHERE f.form_text IN ({placeholders}) AND e.lang = ?
|
| 1465 |
AND f.id NOT IN (
|
| 1466 |
SELECT ft.form_id
|
| 1467 |
FROM form_tags ft
|
| 1468 |
JOIN tags t ON ft.tag_id = t.id
|
| 1469 |
WHERE t.tag IN ('variant', 'auxiliary')
|
| 1470 |
)
|
| 1471 |
+
"""
|
| 1472 |
+
params_form = list(search_variants) + [lang_query]
|
| 1473 |
+
|
| 1474 |
+
form_q = conn.execute(sql_form, params_form).fetchall()
|
| 1475 |
for row in form_q:
|
| 1476 |
found_entry_ids.add(row["id"])
|
| 1477 |
|
| 1478 |
+
# 3. Add Parent Lemmas (if any found in step 1)
|
| 1479 |
if parent_lemmas_to_find:
|
| 1480 |
log(f"Wiktionary: Found parent lemmas to add: {parent_lemmas_to_find}")
|
| 1481 |
for lemma_word in parent_lemmas_to_find:
|
| 1482 |
+
# Look up each parent lemma by exact match (one query per lemma; no recursion)
|
| 1483 |
parent_id_q = conn.execute(
|
| 1484 |
+
"SELECT id FROM entries WHERE word = ? AND lang = ?", (lemma_word, lang_query)
|
| 1485 |
).fetchall()
|
| 1486 |
for row in parent_id_q:
|
| 1487 |
found_entry_ids.add(row["id"])
|