File size: 6,401 Bytes
38116a9
 
5e8c89b
38116a9
 
 
34202a3
 
38116a9
 
b605bba
34202a3
 
 
 
8f7bb50
34202a3
 
 
 
 
b605bba
 
 
 
 
 
 
 
 
34202a3
b605bba
 
 
 
 
 
 
 
 
 
05dab99
38116a9
 
 
 
b605bba
 
 
 
 
 
 
 
 
 
9fcb0ad
4095e7a
b605bba
 
 
 
 
 
 
4095e7a
4f5f3f9
4095e7a
b605bba
4095e7a
b605bba
 
 
 
 
 
 
 
645b14b
c78e3ff
 
9fcb0ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b605bba
 
 
 
56e2fd6
 
b605bba
 
 
 
56e2fd6
 
b605bba
 
 
 
56e2fd6
 
b605bba
 
 
 
56e2fd6
9fcb0ad
 
 
 
 
38116a9
 
 
 
9fcb0ad
971f291
4095e7a
9fcb0ad
 
 
 
38116a9
 
 
 
9fcb0ad
38116a9
 
 
 
 
56e2fd6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import gradio as gr
import pandas as pd
import re

# Load and parse the CSV file from Hugging Face
def load_data():
    """Fetch the RAVNlex lexicon CSV and group its rows by lemma.

    Returns:
        dict: maps each lemma (str) to a list of entry dicts with keys
        'word', 'PPOS', 'PHON1', 'PHON2', 'COMM' and 'pronunciations'.
    """
    url = "https://huggingface.co/datasets/unijoh/RAVNlex/resolve/main/RAVNlex_small.csv"
    # dtype=str keeps cells textual; missing cells still arrive as NaN floats
    # and are normalized to "" below.
    df = pd.read_csv(url, delimiter='\t', encoding='iso-8859-10', dtype=str)
    lemmas = {}
    current_lemma = None
    
    for row in df.itertuples(index=False, name=None):
        if len(row) < 5:
            print(f"Skipping problematic line: {row}")
            continue
        # Normalize non-string (NaN) cells to empty strings before unpacking.
        orto, ppos, phon1, phon2, comm, *pronunciations = (
            x if isinstance(x, str) else "" for x in row
        )
        if orto == '---':
            # '---' marks the end of the current lemma's block of rows.
            current_lemma = None
            continue
        if current_lemma is None:
            # First row of a new lemma block: its ORTO field names the lemma.
            current_lemma = orto.replace("ORTO:", "")
            # NOTE(review): a lemma appearing twice would overwrite its
            # earlier entries here — presumably the data never repeats one.
            lemmas[current_lemma] = []
            word = current_lemma
        else:
            word = orto.replace("ORTO:", "") if orto else ""
        # Both the lemma row and its inflected-form rows share this shape;
        # the original duplicated this dict literal in both branches.
        lemmas[current_lemma].append({
            'word': word,
            'PPOS': ppos.replace("PPOS:", "") if ppos else "",
            'PHON1': phon1.replace("PHON:", "") if phon1 else "",
            'PHON2': phon2.replace("PHON:", "") if phon2 else "",
            'COMM': comm if comm else "",
            'pronunciations': pronunciations
        })
    
    print("Loaded lemmas:", lemmas)  # Debugging output
    return lemmas

# Build the lemma lookup table once at import time (fetches the CSV over the network).
lemmas = load_data()

def expand_ppos(ppos):
    """Expand bracketed character alternatives in a PPOS tag.

    A tag like ``'ncms[na]==iuu'`` denotes several concrete tags, one per
    character inside the brackets: ``['ncmsn==iuu', 'ncmsa==iuu']``.

    The original implementation only expanded the FIRST bracket group
    (``matches[0]``), so a tag with two groups kept the second group
    verbatim and could never match a table key.  This version expands
    every group (cartesian product); single-group behavior is unchanged.

    Args:
        ppos: normalized PPOS tag, possibly containing ``[...]`` groups.

    Returns:
        list[str]: all concrete tags; ``[ppos]`` when there are no groups.
    """
    groups = re.findall(r'\[([^\]]+)\]', ppos)
    if not groups:
        return [ppos]
    expanded = [ppos]
    # dict.fromkeys dedupes repeated group strings while preserving order;
    # str.replace already substitutes every occurrence of a given group.
    for group in dict.fromkeys(groups):
        expanded = [
            tag.replace(f'[{group}]', char)
            for tag in expanded
            for char in group
        ]
    return expanded

def create_noun_table(lemma, forms):
    """Render an HTML declension table for a noun lemma.

    Args:
        lemma: the headword being displayed (used only for debug output).
        forms: list of entry dicts from load_data(); each form's PPOS tag
            (possibly containing bracketed alternatives) is expanded via
            expand_ppos() and, when it matches a key below, the form's word
            is slotted into the table.

    Returns:
        str: an HTML <table> with singular/plural x indefinite/definite
        columns and one row per case (nominative, accusative, dative,
        genitive).
    """
    # Keys are lowercase PPOS tags.  Gender is folded per cell in the HTML
    # below via `or`-chains (masculine, then feminine, then neuter).
    # NOTE(review): the feminine/neuter plural keys use 'n' in the number
    # slot ('ncfnn', 'ncnnn') where the masculine ones use 'p' ('ncmpn') —
    # possibly typos for 'ncfpn'/'ncnpn'; confirm against the RAVNlex tag set.
    # NOTE(review): the dative-plural row reuses the 'ncfnn==iuu' etc. keys
    # that the nominative-plural row also reads — verify this is intended.
    table_data = {
        'ncmsn==iuu': '', 'ncmsn==duu': '', 'ncfsn==iuu': '', 'ncfsn==duu': '', 'ncnsn==iuu': '', 'ncnsn==duu': '',
        'ncmsa==iuu': '', 'ncmsa==duu': '', 'ncfsa==iuu': '', 'ncfsa==duu': '', 'ncnsa==iuu': '', 'ncnsa==duu': '',
        'ncmsd==iuu': '', 'ncmsd==duu': '', 'ncfsd==iuu': '', 'ncfsd==duu': '', 'ncnsd==iuu': '', 'ncnsd==duu': '',
        'ncmsg==iou': '', 'ncmsg==dou': '', 'ncfsg==iou': '', 'ncfsg==dou': '', 'ncnsg==iou': '', 'ncnsg==dou': '',
        'ncmpn==iuu': '', 'ncmpn==duu': '', 'ncfnn==iuu': '', 'ncfnn==duu': '', 'ncnnn==iuu': '', 'ncnnn==duu': '',
        'ncmpa==iuu': '', 'ncmpa==duu': '', 'ncfna==iuu': '', 'ncfna==duu': '', 'ncnna==iuu': '', 'ncnna==duu': '',
        'ncmpd==iuu': '', 'ncmpd==duu': '', 'ncmpg==iou': '', 'ncmpg==dou': '', 'ncfnn==iou': '', 'ncfnn==dou': '', 'ncnnn==iou': '', 'ncnnn==dou': ''
    }
    
    # Fill in each table slot; unmatched expanded tags are only logged.
    for form in forms:
        ppos = form['PPOS'].lower()  # Normalize to lowercase
        word = form['word']
        print(f"Processing: word={word}, ppos={ppos}, key={ppos}")
        expanded_ppos_list = expand_ppos(ppos)
        for expanded_ppos in expanded_ppos_list:
            key = expanded_ppos
            if key in table_data:
                table_data[key] = word
            else:
                print(f"Unmatched key: {key} for word: {word} with PPOS: {ppos}")

    print(f"Final table data for {lemma}: {table_data}")  # Debugging output

    # Headers are Faroese: Eintal/Fleirtal = singular/plural,
    # Óbundið/Bundið = indefinite/definite.
    table = f"""
    <table border="1">
        <thead>
            <tr>
                <th colspan="2">Eintal</th>
                <th colspan="2">Fleirtal</th>
            </tr>
            <tr>
                <th>Óbundið</th>
                <th>Bundið</th>
                <th>Óbundið</th>
                <th>Bundið</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>{table_data['ncmsn==iuu'] or table_data['ncfsn==iuu'] or table_data['ncnsn==iuu']}</td>
                <td>{table_data['ncmsn==duu'] or table_data['ncfsn==duu'] or table_data['ncnsn==duu']}</td>
                <td>{table_data['ncmpn==iuu'] or table_data['ncfnn==iuu'] or table_data['ncnnn==iuu']}</td>
                <td>{table_data['ncmpn==duu'] or table_data['ncfnn==duu'] or table_data['ncnnn==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsa==iuu'] or table_data['ncfsa==iuu'] or table_data['ncnsa==iuu']}</td>
                <td>{table_data['ncmsa==duu'] or table_data['ncfsa==duu'] or table_data['ncnsa==duu']}</td>
                <td>{table_data['ncmpa==iuu'] or table_data['ncfna==iuu'] or table_data['ncnna==iuu']}</td>
                <td>{table_data['ncmpa==duu'] or table_data['ncfna==duu'] or table_data['ncnna==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsd==iuu'] or table_data['ncfsd==iuu'] or table_data['ncnsd==iuu']}</td>
                <td>{table_data['ncmsd==duu'] or table_data['ncfsd==duu'] or table_data['ncnsd==duu']}</td>
                <td>{table_data['ncmpd==iuu'] or table_data['ncfnn==iuu'] or table_data['ncnnn==iuu']}</td>
                <td>{table_data['ncmpd==duu'] or table_data['ncfnn==duu'] or table_data['ncnnn==duu']}</td>
            </tr>
            <tr>
                <td>{table_data['ncmsg==iou'] or table_data['ncfsg==iou'] or table_data['ncnsg==iou']}</td>
                <td>{table_data['ncmsg==dou'] or table_data['ncfsg==dou'] or table_data['ncnsg==dou']}</td>
                <td>{table_data['ncmpg==iou'] or table_data['ncfnn==iou'] or table_data['ncnnn==iou']}</td>
                <td>{table_data['ncmpg==dou'] or table_data['ncfnn==dou'] or table_data['ncnnn==dou']}</td>
            </tr>
        </tbody>
    </table>
    """
    return table

def search_lemma(lemma):
    """Look up *lemma* in the loaded lexicon and render it as HTML.

    Args:
        lemma: headword typed into the Gradio text box.

    Returns:
        str: an HTML declension table for nouns, or a plain message when
        the lemma is unknown or not a noun.
    """
    results = lemmas.get(lemma)
    if not results:
        return f"No results found for {lemma}"
    
    # Noun PPOS tags begin with 'n' (e.g. 'ncmsn==iuu').  The original
    # substring test ('n' in ppos) also matched any non-noun tag that merely
    # contained the letter 'n' anywhere; a prefix check fixes that.
    if results[0]['PPOS'].lower().startswith('n'):
        return create_noun_table(lemma, results)
    return "Only noun tables are currently supported."

# Gradio UI: one text input (the lemma), rendered output is raw HTML
# produced by search_lemma().
iface = gr.Interface(
    fn=search_lemma,
    inputs="text",
    outputs="html",
    title="Lemma Search",
    description="Enter a lemma to search for its declensions and pronunciations."
)

# Launch the web app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()