import json

import gensim
import spacy
import streamlit as st
from transformers import pipeline


@st.cache_resource
def load_w2v(model_path):
    """Load a binary word2vec model from disk (cached across Streamlit reruns)."""
    return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)


@st.cache_resource
def load_spacy():
    """Load the large Spanish spaCy pipeline (cached across Streamlit reruns)."""
    return spacy.load('es_core_news_lg')


@st.cache_resource
def load_bert():
    """Load the Spanish BERT fill-mask pipeline (cached across Streamlit reruns)."""
    return pipeline(
        "fill-mask",
        model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro",
    )


nlp = load_spacy()

news_model_path = r'news_phrases_s300_cw10_mc50_w4_negative5-075_mean_e10_notshr.bin.gz'
all_model_path = r'ALL_annot_all_pos_spell_g_h_new_phrases_s300_cw10_mc100_w4_negative5-075_mean_e10_shr.bin.gz'
lit_model_path = r'LITERATURA_annot_all_pos_spell_g_h_phrases_s300_cw10_mc50_w4_negative_5-075_mean_e20_shr.bin.gz'

# Load the CEFR lexical minimums (one lemma per line).
a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
a2_path, a2_target_set = r'lexical_minimums/A2_MINIMUM.txt', set()
b1_path, b1_target_set = r'lexical_minimums/B1_MINIMUM.txt', set()
b2_path, b2_target_set = r'lexical_minimums/B2_MINIMUM.txt', set()
c1_path, c1_target_set = r'lexical_minimums/C1_MINIMUM.txt', set()
c2_path, c2_target_set = r'lexical_minimums/C2_MINIMUM.txt', set()
# Note: the original tuple listed only the A1-B2 paths, which left the C1 and
# C2 sets empty; all six paths are included here so every level is populated.
minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set,
                 c1_target_set, c2_target_set)
for path, target_set in zip(minimums_paths, minimums_sets):
    with open(path, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            target_set.add(line.strip())

# Distractors for a level are drawn from that level's minimum plus the
# minimum of the level directly below it.
a1_distractor_set = a1_target_set
a2_distractor_set = a2_target_set.union(a1_target_set)
b1_distractor_set = b1_target_set.union(a2_target_set)
b2_distractor_set = b2_target_set.union(b1_target_set)
c1_distractor_set = c1_target_set.union(b2_target_set)
c2_distractor_set = c2_target_set.union(c1_target_set)

with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
    PHRASES = set(json.load(f)['PHRASES'])

with open('language_data/fix_irregular_lemma.json', 'r', encoding='utf-8') as f:
    FIX_LEMMA = json.load(f)

# Per-level similarity thresholds; the 'Нет' key is Russian for "None",
# i.e. no level selected. The keys are kept as-is because the UI matches on them.
SIMILARITY_VALUES = {'A1': 1.0, 'A2': 1.0, 'B1': 1.0, 'B2': 1.0,
                     'C1': 1.0, 'C2': 1.0, 'Нет': 1.0}
SIMILARITY_VALUES_bert = {'A1': 1.0, 'A2': 1.0, 'B1': 1.0, 'B2': 1.0,
                          'C1': 1.0, 'C2': 1.0, 'Нет': 1.0}

BAD_USER_TARGET_WORDS = []
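
# A minimal smoke test (an assumption, not part of the original app): load the
# news word2vec model via the cached loader and inspect a few vocabulary keys.
# The lemma_POS key format is a guess based on the annotated-corpus model names
# above, so we print entries rather than querying a specific word.
if __name__ == '__main__':
    w2v = load_w2v(news_model_path)
    # Show the first few vocabulary entries to confirm the actual key format.
    print(w2v.index_to_key[:10])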
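
# Hypothetical helper (not in the original app) sketching how the distractor
# pools above can be queried; the function name is an assumption.
def lowest_level_containing(lemma):
    """Return the lowest CEFR level whose distractor pool contains `lemma`."""
    pools = (('A1', a1_distractor_set), ('A2', a2_distractor_set),
             ('B1', b1_distractor_set), ('B2', b2_distractor_set),
             ('C1', c1_distractor_set), ('C2', c2_distractor_set))
    for level, pool in pools:
        if lemma in pool:
            return level
    return None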