# NOTE: stray extraction artifacts (file size, git-blame hashes, line-number
# gutter) removed from the top of this file.
import gradio as gr
import pandas as pd
import re
# Load and parse the CSV file from Hugging Face
def load_data():
    """Fetch the RAVNlex CSV and group its rows into a lemma -> forms dict.

    The CSV is tab-separated; rows belonging to one lemma are grouped
    together and lemma groups are separated by a row whose first column
    is '---'. The first row after a separator names the lemma; it and all
    following rows (inflected forms) are appended to that lemma's list.

    Returns:
        dict mapping lemma string -> list of form dicts with keys
        'word', 'PPOS', 'PHON1', 'PHON2', 'COMM', 'pronunciations'.
    """
    url = "https://huggingface.co/datasets/unijoh/RAVNlex/resolve/main/RAVNlex_small.csv"
    df = pd.read_csv(url, delimiter='\t', encoding='iso-8859-10', dtype=str)
    lemmas = {}
    current_lemma = None
    for row in df.itertuples(index=False, name=None):
        if len(row) < 5:
            print(f"Skipping problematic line: {row}")
            continue
        # Normalize non-string cells (NaN from pandas) to "".
        orto, ppos, phon1, phon2, comm, *pronunciations = (
            x if isinstance(x, str) else "" for x in row
        )
        if orto == '---':
            # Separator row: the next data row starts a new lemma group.
            current_lemma = None
            continue
        word = orto.replace("ORTO:", "") if orto else ""
        if current_lemma is None:
            # First row after a separator is the lemma headword itself.
            current_lemma = word
            lemmas[current_lemma] = []
        # Both the headword row and inflected-form rows carry the same
        # field layout, so one dict construction serves both cases
        # (the original duplicated this block in two branches).
        lemmas[current_lemma].append({
            'word': word,
            'PPOS': ppos.replace("PPOS:", "") if ppos else "",
            'PHON1': phon1.replace("PHON:", "") if phon1 else "",
            'PHON2': phon2.replace("PHON:", "") if phon2 else "",
            'COMM': comm if comm else "",
            'pronunciations': pronunciations,
        })
    # Summary instead of dumping the entire lexicon dict to stdout.
    print(f"Loaded {len(lemmas)} lemmas")
    return lemmas
# Build the lemma index once at import time (fetches the CSV over the network).
lemmas = load_data()
def expand_ppos(ppos):
    """Expand bracketed character alternations in a PPOS tag.

    'nc[mfn]sn==iuu' -> ['ncmsn==iuu', 'ncfsn==iuu', 'ncnsn==iuu'].
    A tag without brackets is returned unchanged as a one-element list.

    The previous implementation expanded only the first bracket group,
    leaving any later '[...]' in the output verbatim (so those tags could
    never match a table slot); this version expands every group
    recursively, producing all combinations.
    """
    match = re.search(r'\[([^\]]+)\]', ppos)
    if match is None:
        return [ppos]
    expanded = []
    for char in match.group(1):
        # Substitute one alternative for this group, then recurse to
        # expand any remaining groups in the tail.
        candidate = ppos[:match.start()] + char + ppos[match.end():]
        expanded.extend(expand_ppos(candidate))
    return expanded
def create_noun_table(lemma, forms):
    """Render an HTML declension table for a noun lemma.

    Args:
        lemma: the headword (used only in debug output).
        forms: list of form dicts as produced by load_data(); each must
            carry 'word' and 'PPOS' keys.

    Returns:
        An HTML <table> string: four case rows (nominative, accusative,
        dative, genitive) by singular/plural ("Eintal"/"Fleirtal") and
        indefinite/definite ("Óbundið"/"Bundið") columns.
    """
    # Slots keyed by lower-cased, bracket-expanded PPOS tag; unmatched
    # slots stay "" and render as empty cells.
    # NOTE(review): key scheme appears to be n|c|gender|number|case|... —
    # confirm against the RAVNlex PPOS documentation.
    table_data = {
        'ncmsn==iuu': '', 'ncmsn==duu': '', 'ncfsn==iuu': '', 'ncfsn==duu': '', 'ncnsn==iuu': '', 'ncnsn==duu': '',
        'ncmsa==iuu': '', 'ncmsa==duu': '', 'ncfsa==iuu': '', 'ncfsa==duu': '', 'ncnsa==iuu': '', 'ncnsa==duu': '',
        'ncmsd==iuu': '', 'ncmsd==duu': '', 'ncfsd==iuu': '', 'ncfsd==duu': '', 'ncnsd==iuu': '', 'ncnsd==duu': '',
        'ncmsg==iou': '', 'ncmsg==dou': '', 'ncfsg==iou': '', 'ncfsg==dou': '', 'ncnsg==iou': '', 'ncnsg==dou': '',
        'ncmpn==iuu': '', 'ncmpn==duu': '', 'ncfnn==iuu': '', 'ncfnn==duu': '', 'ncnnn==iuu': '', 'ncnnn==duu': '',
        'ncmpa==iuu': '', 'ncmpa==duu': '', 'ncfna==iuu': '', 'ncfna==duu': '', 'ncnna==iuu': '', 'ncnna==duu': '',
        'ncmpd==iuu': '', 'ncmpd==duu': '', 'ncmpg==iou': '', 'ncmpg==dou': '', 'ncfnn==iou': '', 'ncfnn==dou': '', 'ncnnn==iou': '', 'ncnnn==dou': ''
    }
    # Fill slots: each form's PPOS may contain bracket alternations
    # (e.g. '[mfn]') that expand to several concrete tags.
    for form in forms:
        ppos = form['PPOS'].lower()  # Normalize to lowercase
        word = form['word']
        print(f"Processing: word={word}, ppos={ppos}, key={ppos}")
        expanded_ppos_list = expand_ppos(ppos)
        for expanded_ppos in expanded_ppos_list:
            key = expanded_ppos
            if key in table_data:
                table_data[key] = word
            else:
                print(f"Unmatched key: {key} for word: {word} with PPOS: {ppos}")
    print(f"Final table data for {lemma}: {table_data}")  # Debugging output
    # Each cell falls back across the three genders (m/f/n): the `or`
    # chain yields the first non-empty form, else "".
    # NOTE(review): the dative-plural row (feminine/neuter cells) and the
    # genitive-plural row reuse the 'ncfnn'/'ncnnn' nominative-plural
    # keys — there are no dedicated dative-plural feminine/neuter slots.
    # Looks suspicious; verify against the intended tag scheme.
    table = f"""
    <table border="1">
        <thead>
            <tr>
                <th colspan="2">Eintal</th>
                <th colspan="2">Fleirtal</th>
            </tr>
            <tr>
                <th>Óbundið</th>
                <th>Bundið</th>
                <th>Óbundið</th>
                <th>Bundið</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>{table_data['ncmsn==iuu'] or table_data['ncfsn==iuu'] or table_data['ncnsn==iuu']}</td>
                <td>{table_data['ncmsn==duu'] or table_data['ncfsn==duu'] or table_data['ncnsn==duu']}</td>
                <td>{table_data['ncmpn==iuu'] or table_data['ncfnn==iuu'] or table_data['ncnnn==iuu']}</td>
                <td>{table_data['ncmpn==duu'] or table_data['ncfnn==duu'] or table_data['ncnnn==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsa==iuu'] or table_data['ncfsa==iuu'] or table_data['ncnsa==iuu']}</td>
                <td>{table_data['ncmsa==duu'] or table_data['ncfsa==duu'] or table_data['ncnsa==duu']}</td>
                <td>{table_data['ncmpa==iuu'] or table_data['ncfna==iuu'] or table_data['ncnna==iuu']}</td>
                <td>{table_data['ncmpa==duu'] or table_data['ncfna==duu'] or table_data['ncnna==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsd==iuu'] or table_data['ncfsd==iuu'] or table_data['ncnsd==iuu']}</td>
                <td>{table_data['ncmsd==duu'] or table_data['ncfsd==duu'] or table_data['ncnsd==duu']}</td>
                <td>{table_data['ncmpd==iuu'] or table_data['ncfnn==iuu'] or table_data['ncnnn==iuu']}</td>
                <td>{table_data['ncmpd==duu'] or table_data['ncfnn==duu'] or table_data['ncnnn==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsg==iou'] or table_data['ncfsg==iou'] or table_data['ncnsg==iou']}</td>
                <td>{table_data['ncmsg==dou'] or table_data['ncfsg==dou'] or table_data['ncnsg==dou']}</td>
                <td>{table_data['ncmpg==iou'] or table_data['ncfnn==iou'] or table_data['ncnnn==iou']}</td>
                <td>{table_data['ncmpg==dou'] or table_data['ncfnn==dou'] or table_data['ncnnn==dou']}</td>
            </tr>
        </tbody>
    </table>
    """
    return table
def search_lemma(lemma):
    """Look up a lemma and return an HTML table (or a plain message).

    Args:
        lemma: headword typed by the user.

    Returns:
        HTML string for nouns, otherwise an explanatory message.
    """
    results = lemmas.get(lemma, None)
    if not results:
        return f"No results found for {lemma}"
    # Noun PPOS tags begin with 'n' (e.g. 'ncmsn==iuu'). The previous
    # substring test (`'n' in ...`) would also match any non-noun tag
    # containing an 'n' anywhere.
    if results[0]['PPOS'].lower().startswith('n'):
        table = create_noun_table(lemma, results)
    else:
        table = "Only noun tables are currently supported."
    return table
# Gradio UI: one text input routed to search_lemma, rendered as raw HTML.
iface = gr.Interface(
    fn=search_lemma,
    inputs="text",
    outputs="html",
    title="Lemma Search",
    description="Enter a lemma to search for its declensions and pronunciations."
)

# Launch the web app only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()