|
|
import pandas as pd |
|
|
import streamlit as st |
|
|
|
|
|
# Load the known suffixes and prefixes, one morpheme per line.
# Blank lines are skipped: an empty string in either list would make an empty
# segment (e.g. from a doubled "=" in the input) match as a real affix.
with open('suffixes.txt', encoding='utf-8') as f:
    suffixes = [entry for entry in map(str.strip, f) if entry]

with open('prefixes.txt', encoding='utf-8') as f:
    prefixes = [entry for entry in map(str.strip, f) if entry]
|
|
|
|
|
# Closed morpheme classes that are recognised without the external word lists.
_INTERFIXES = ('а', 'ар', 'е', 'ей', 'и', 'ич', 'л', 'о', 'у', 'ш')
_UNIFIXES = ('адьj', 'амт', 'ачей')
_POSTFIXES = ('же', 'либо', 'нибудь', 'с', 'сь', 'ся', 'то', 'те')


def _tag_inner_morpheme(morpheme, tagged, prefixes, suffixes):
    """Tag a morpheme that stands directly before a "=" and append it.

    *tagged* holds the (morpheme, tag) pairs already recognised to the right
    of *morpheme*, nearest neighbour last. It is modified in place: the new
    pair is appended, and the nearest neighbour may be relabelled from
    'prefix' to 'interfix'.
    """
    last_tag = tagged[-1][1] if tagged else None

    if morpheme in prefixes and last_tag in ('root', 'prefix'):
        tag = 'prefix'
    elif morpheme in suffixes and tagged and last_tag not in ('root', 'prefix'):
        tag = 'suffix'
    elif morpheme in _UNIFIXES:
        tag = 'unifix'
    elif morpheme in _POSTFIXES and not tagged:
        # A postfix is only accepted as the rightmost morpheme of the word.
        tag = 'postfix'
    elif not tagged:
        tag = 'suffix' if morpheme in suffixes else 'root'
    elif last_tag in ('ending', 'suffix'):
        tag = 'root'
    elif (len(tagged) >= 2 and tagged[-2][1] == 'root'
          and last_tag in ('prefix', 'interfix')):
        # Pattern "<morpheme> = <linker> = <root>": decide whether the linker
        # to the right is really an interfix joining two stems.
        linker = tagged[-1][0]
        if linker in _INTERFIXES:
            if morpheme in _INTERFIXES:
                tag = 'interfix'
            else:
                tagged[-1] = (linker, 'interfix')
                # The non-suffix case used to relabel the linker and then drop
                # the current morpheme entirely; it is the stem that the
                # interfix attaches to, so it is kept as a root.
                tag = 'suffix' if morpheme in suffixes else 'root'
        elif morpheme in _INTERFIXES:
            tag = 'interfix'
        elif morpheme in suffixes:
            tag = 'suffix'
        else:
            tag = 'root'
    elif last_tag in ('interfix', 'postfix'):
        tag = 'root'
    else:
        tag = 'unknown'

    tagged.append((morpheme, tag))


def _tag_leading_morpheme(morpheme, tagged, prefixes, suffixes):
    """Tag the leftmost segment of the word and append it to *tagged*."""
    if morpheme in prefixes:
        tag = 'prefix'
    elif morpheme in suffixes:
        tag = 'suffix'
    elif (len(tagged) >= 2 and tagged[-2][1] == 'root'
          and tagged[-1][1] == 'prefix'):
        # "<root> = <prefix> = <root>": the middle element joins two stems.
        tagged[-1] = (tagged[-1][0], 'interfix')
        tag = 'root'
    elif not tagged or tagged[-1][1] in ('ending', 'suffix', 'interfix', 'root'):
        tag = 'root'
    else:
        tag = 'unknown'

    tagged.append((morpheme, tag))


def annotate_morphemes(word, prefixes=None, suffixes=None):
    """Label every morpheme of a pre-segmented word.

    The word is scanned right to left. "-" separates the ending from the
    preceding morpheme (a trailing "-" with nothing after it is skipped);
    "=" separates all other morphemes.

    Args:
        word: Segmented word, e.g. "пере=двиг=а-ть=ся".
        prefixes: Collection of known prefixes. When omitted, the module-level
            ``prefixes`` list is looked up at call time.
        suffixes: Collection of known suffixes. When omitted, the module-level
            ``suffixes`` list is looked up at call time.

    Returns:
        A list of ``(morpheme, tag)`` tuples in word order. Tags are one of
        'prefix', 'root', 'interfix', 'suffix', 'unifix', 'ending',
        'postfix' or 'unknown'. The leftmost segment is always emitted,
        even when it is empty.
    """
    if prefixes is None:
        prefixes = globals()['prefixes']
    if suffixes is None:
        suffixes = globals()['suffixes']

    tagged = []   # (morpheme, tag) pairs, collected right to left
    pending = []  # characters of the current segment, in reverse order

    for char in reversed(word):
        if char == '-':
            if pending:
                tagged.append((''.join(reversed(pending)), 'ending'))
                pending = []
        elif char == '=':
            _tag_inner_morpheme(''.join(reversed(pending)), tagged,
                                prefixes, suffixes)
            pending = []
        else:
            pending.append(char)

    _tag_leading_morpheme(''.join(reversed(pending)), tagged,
                          prefixes, suffixes)

    tagged.reverse()
    return tagged
|
|
|
|
|
# --- Streamlit page: read segmented words and show one tag table per word ---
st.set_page_config(layout='wide')
st.header('Аннотирование морфемого состава слова')
st.markdown(
    'Введите разобранное по составу слово или слова (разделитель — пробел) в следующем формате.'
    '\n\nОкончание отделяется от предыдущей морфемы символом "-", остальные морфемы разделяются символом "=".'
    '\n\nНапример: "у=потребл=ениj-е", "пере=двиг=а-ть=ся быстр=о" .'
)

inpt = st.text_input(label='Аннотировать морфемы в слове(-ах): ')


def _morpheme_table(token):
    """Build the morpheme/tag table for one segmented word."""
    rows = annotate_morphemes(token)
    frame = pd.DataFrame(rows, columns=['Морфема', 'Тег'])
    return frame.set_index(['Морфема'])


# Nothing is rendered until the user has typed something.
if inpt != '':
    if ' ' in inpt:
        # Several words: each table gets its own widget key.
        for position, token in enumerate(inpt.split()):
            st.dataframe(_morpheme_table(token), key=f'dataframe_{position}')
    else:
        st.dataframe(_morpheme_table(inpt))