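# Helper module for the Streamlit app: loads the Spanish spaCy parser, word2vec
# vectors, a Spanish fill-mask BERT pipeline, an extractive summarizer, pickled
# MinMax scalers and CatBoost classifiers, CEFR lexical minimums, and the
# POS-combination rules used elsewhere in the app.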
import json

import spacy
import gensim
import streamlit as st
from pickle import load
from transformers import pipeline
from summarizer import Summarizer
from torch import cuda, device

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = device('cuda' if cuda.is_available() else 'cpu')

def load_w2v(model):
    # Load the word2vec vectors that correspond to the selected model.
    with st.spinner('Загружаю языковую модель'):  # "Loading the language model"
        if model == 'model1':
            model_path = r'language_data/model1.gz'
        else:
            model_path = r'language_data/model2.gz'
        return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

def load_spacy():
    # Load the large Spanish spaCy pipeline used for morpho-syntactic parsing.
    with st.spinner('Загружаю морфо-синтаксический парсер'):  # "Loading the morpho-syntactic parser"
        _nlp = spacy.load('es_core_news_lg')
    return _nlp

def load_bert():
    # Load the Spanish fill-mask pipeline on the device selected above.
    with st.spinner('Загружаю языковую модель'):  # "Loading the language model"
        _pipeline = pipeline(
            task="fill-mask",
            model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro",
            device=device,
        )
    return _pipeline

def load_summarizer():
    # Return a fresh extractive summarizer instance.
    return Summarizer()

def load_classifiers(model):
    # Pick the MinMax scaler / CatBoost classifier pair that matches the selected model.
    if model == 'model1':
        scaler_path = 'language_data/model1_with_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model1_with_wn_catboost_classifier.pickle'
    elif model == 'model2':
        scaler_path = 'language_data/model2_with_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model2_with_wn_catboost_classifier.pickle'
    else:
        scaler_path = 'language_data/model3_with_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model3_with_wn_catboost_classifier.pickle'
    with (open(scaler_path, 'rb') as f1,
          open(classifier_path, 'rb') as f2,
          open('language_data/pos_dict.pickle', 'rb') as f3):
        scaler = load(f1)
        classifier = load(f2)
        pos_dict = load(f3)
    return pos_dict, scaler, classifier

nlp = load_spacy()
summarization = load_summarizer()

# Load the CEFR lexical minimums (one entry per line) into per-level sets.
a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
a2_path, a2_target_set = r'lexical_minimums/A2_MINIMUM.txt', set()
b1_path, b1_target_set = r'lexical_minimums/B1_MINIMUM.txt', set()
b2_path, b2_target_set = r'lexical_minimums/B2_MINIMUM.txt', set()
c1_path, c1_target_set = r'lexical_minimums/C1_MINIMUM.txt', set()
c2_path, c2_target_set = r'lexical_minimums/C2_MINIMUM.txt', set()
minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
for path, target_set in zip(minimums_paths, minimums_sets):
    with open(path, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            target_set.add(line.strip())
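
# For each CEFR level, MINIMUM_SETS stores a pair: the level's own minimum and the
# union of that minimum with the previous level's set (for A1 it is just its own set);
# 'Без уровня' ("no level") maps to (None, None).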
MINIMUM_SETS = {
    'A1': (a1_target_set, a1_target_set),
    'A2': (a2_target_set, a2_target_set.union(a1_target_set)),
    'B1': (b1_target_set, b1_target_set.union(a2_target_set)),
    'B2': (b2_target_set, b2_target_set.union(b1_target_set)),
    'C1': (c1_target_set, c1_target_set.union(b2_target_set)),
    'C2': (c2_target_set, c2_target_set.union(c1_target_set)),
    'Без уровня': (None, None)
}
LEVEL_NUMBERS = {'A1': 1, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 4}

# Known phrases and manual lemma fixes for irregular forms, loaded from JSON.
with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
    PHRASES = set(json.load(f)['PHRASES'])
with open('language_data/fix_irregular_lemma.json', 'r', encoding='utf-8') as f:
    FIX_LEMMA = json.load(f)

BAD_USER_TARGET_WORDS = []
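
# COMBINE_POS lists, per mode ('simple' vs. 'phrase') and per CEFR level, which POS
# tags may stand in for each other. This reading is inferred from the keys below;
# the matching logic that consumes the mapping lives elsewhere in the app.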
COMBINE_POS = {
    'simple': {
        'A1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'A2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'],
               'ADP': ['SCONJ', 'ADV']},
        'C1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'],
                       'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
    },
    'phrase': {
        'A1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'A2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'],
               'ADP': ['SCONJ', 'ADV']},
        'C1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'],
                       'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
    },
}
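
# Illustrative usage from the app side (a sketch only; the calls below are an
# assumption about how these loaders are combined, not part of this module):
#   w2v = load_w2v('model1')
#   pos_dict, scaler, classifier = load_classifiers('model1')
#   fill_mask = load_bert()
#   doc = nlp('Texto de ejemplo para analizar.')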