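"""Model loading and language-data setup for the Streamlit app: cached loaders
for the word2vec, spaCy and BERT fill-mask models, CEFR lexical minimums,
distractor sets, and auxiliary language data."""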
import json
import spacy
import gensim
import streamlit as st
from transformers import pipeline


# Heavy models are cached with st.cache_resource so each one is loaded only
# once per session instead of on every Streamlit re-run.
@st.cache_resource
def load_w2v(model_path):
    with st.spinner('Loading the language model'):
        _w2v_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
    return _w2v_model


@st.cache_resource
def load_spacy():
    with st.spinner('Loading the morpho-syntactic parser'):
        _nlp = spacy.load('es_core_news_lg')
    return _nlp


@st.cache_resource
def load_bert():
    with st.spinner('Loading the language model'):
        _pipeline = pipeline(task="fill-mask", model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro")
    return _pipeline


nlp = load_spacy()
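# Pretrained word2vec models in binary format (see load_w2v).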
w2v_model_1_path = r'model1.gz'
w2v_model_2_path = r'model2.gz'

# Load the lexical minimum for each CEFR level into its own set (one entry per line).
a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
a2_path, a2_target_set = r'lexical_minimums/A2_MINIMUM.txt', set()
b1_path, b1_target_set = r'lexical_minimums/B1_MINIMUM.txt', set()
b2_path, b2_target_set = r'lexical_minimums/B2_MINIMUM.txt', set()
c1_path, c1_target_set = r'lexical_minimums/C1_MINIMUM.txt', set()
c2_path, c2_target_set = r'lexical_minimums/C2_MINIMUM.txt', set()
minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
for path, target_set in zip(minimums_paths, minimums_sets):
    with open(path, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            target_set.add(line.strip())

# Each level's distractor pool combines its own minimum with the minimum of the level below.
a1_distractor_set = a1_target_set
a2_distractor_set = a2_target_set.union(a1_target_set)
b1_distractor_set = b1_target_set.union(a2_target_set)
b2_distractor_set = b2_target_set.union(b1_target_set)
c1_distractor_set = c1_target_set.union(b2_target_set)
c2_distractor_set = c2_target_set.union(c1_target_set)

# Known multi-word phrases.
with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
    PHRASES = set(json.load(f)['PHRASES'])

# Manual overrides for lemmas the parser handles incorrectly.
with open('language_data/fix_irregular_lemma.json', 'r', encoding='utf-8') as f:
    FIX_LEMMA = json.load(f)

# Per-level similarity settings ('Без уровня' = 'no level'); all currently 1.0.
SIMILARITY_VALUES = {'A1': 1.0, 'A2': 1.0, 'B1': 1.0, 'B2': 1.0, 'C1': 1.0, 'C2': 1.0, 'Без уровня': 1.0}
SIMILARITY_VALUES_bert = {'A1': 1.0, 'A2': 1.0, 'B1': 1.0, 'B2': 1.0, 'C1': 1.0, 'C2': 1.0, 'Без уровня': 1.0}

# User-supplied target words that could not be processed; populated elsewhere in the app.
BAD_USER_TARGET_WORDS = []
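
# Hypothetical usage of the cached loaders defined above (names taken from this module):
#   w2v_model = load_w2v(w2v_model_1_path)
#   mask_filler = load_bert()
#   doc = nlp('El gato duerme en la cama.')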