"""Sentence-level sustainability and forward-looking-statement tagging.

Importing this module loads three models as a side effect: the RoBERTa
tokenizer plus the ``BERTClass`` sustainability checkpoint, the spaCy
``en_core_web_sm`` pipeline, and the ``yiyanghkust/finbert-fls`` classifier.
"""
from transformers import RobertaTokenizer, pipeline
import torch
import nltk
from nltk.tokenize import sent_tokenize
from fin_readability_sustainability import BERTClass, do_predict
import pandas as pd
import en_core_web_sm

# Fetches the NLTK 'punkt' resource used by sent_tokenize (runs on import).
nltk.download('punkt')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SUSTAINABILITY STARTS
tokenizer_sus = RobertaTokenizer.from_pretrained('roberta-base')
model_sustain = BERTClass(2, "sustanability")
model_sustain.to(device)
# NOTE(review): torch.load unpickles the checkpoint; only load a trusted
# 'sustainability_model.bin'.
model_sustain.load_state_dict(
    torch.load('sustainability_model.bin', map_location=device)['model_state_dict']
)

# Cut-offs applied to the per-sentence values in element [1] of do_predict's
# result. NOTE(review): the values exceed 1, so they are presumably scores
# rather than probabilities -- confirm against do_predict.
_NON_SUSTAINABLE_MIN_SCORE = 4.384316
_SUSTAINABLE_MAX_SCORE = 1.423736


def get_sustainability(text):
    """Label each sentence of *text* by its sustainability score.

    The text is split with NLTK's ``sent_tokenize`` and scored with
    ``do_predict`` using the module-level sustainability model and tokenizer.

    Args:
        text: Raw input text.

    Returns:
        A list of ``(sentence, label)`` tuples in sentence order, where label
        is ``'non-sustainable'`` (score >= 4.384316), ``'sustainable'``
        (score <= 1.423736) or ``'-'`` otherwise. Returns an empty list when
        the text contains no sentences.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to score; avoid calling the model on an empty frame.
        return []

    df = pd.DataFrame({'sentence': sentences})
    actual_predictions_sustainability = do_predict(model_sustain, tokenizer_sus, df)

    highlight = []
    for sent, score in zip(df['sentence'].values, actual_predictions_sustainability[1]):
        if score >= _NON_SUSTAINABLE_MIN_SCORE:
            label = 'non-sustainable'
        elif score <= _SUSTAINABLE_MAX_SCORE:
            label = 'sustainable'
        else:
            label = '-'
        highlight.append((sent, label))
    return highlight
# SUSTAINABILITY ENDS


# Forward Looking Statement
nlp = en_core_web_sm.load()


def split_in_sentences(text):
    """Return the spaCy sentence spans of *text* as stripped strings."""
    doc = nlp(text)
    return [str(sent).strip() for sent in doc.sents]


def make_spans(text, results, sentences=None):
    """Pair each sentence with the ``'label'`` of its classification result.

    Args:
        text: Raw input text; split with ``split_in_sentences`` when
            *sentences* is not supplied.
        results: Sequence of mappings, each with a ``'label'`` key, one per
            sentence and in sentence order.
        sentences: Optional pre-split sentences of *text*. Passing them avoids
            running the spaCy pipeline a second time.

    Returns:
        A list of ``(sentence, label)`` tuples; as with ``zip``, the length is
        that of the shorter input.
    """
    if sentences is None:
        sentences = split_in_sentences(text)
    labels = [result['label'] for result in results]
    return list(zip(sentences, labels))


fls_model = pipeline(
    "text-classification",
    model="yiyanghkust/finbert-fls",
    tokenizer="yiyanghkust/finbert-fls",
)


def fls(text):
    """Classify each sentence of *text* with the finbert-fls pipeline.

    Args:
        text: Raw input text.

    Returns:
        A list of ``(sentence, label)`` tuples in sentence order; an empty
        list when the text contains no sentences.
    """
    # Split once and reuse the result so the labels line up with exactly the
    # sentences that were classified.
    sentences = split_in_sentences(text)
    if not sentences:
        return []
    results = fls_model(sentences)
    return make_spans(text, results, sentences=sentences)