|
import json |
|
import re |
|
|
|
import openai |
|
import pandas as pd |
|
import requests |
|
import spacy |
|
import spacy_transformers |
|
import streamlit_scrollable_textbox as stx |
|
import torch |
|
from InstructorEmbedding import INSTRUCTOR |
|
from sentence_transformers import SentenceTransformer |
|
from gradio_client import Client |
|
from tqdm import tqdm |
|
from transformers import ( |
|
AutoModelForMaskedLM, |
|
AutoModelForSeq2SeqLM, |
|
AutoTokenizer, |
|
T5ForConditionalGeneration, |
|
T5Tokenizer, |
|
pipeline, |
|
) |
|
from rank_bm25 import BM25Okapi, BM25L, BM25Plus |
|
import numpy as np |
|
from nltk.tokenize import word_tokenize |
|
from nltk.corpus import stopwords |
|
from nltk.stem.porter import PorterStemmer |
|
import re |
|
import streamlit as st |
|
|
|
|
|
@st.cache_resource
def get_data():
    """Load the earnings-call metadata table.

    Reads ``earnings_calls_cleaned_metadata.csv`` (a relative path, so it is
    resolved against the current working directory) with pandas.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv("earnings_calls_cleaned_metadata.csv")
|
|
|
|
|
|
|
|
|
|
|
def tokenizer(
    string, reg=r"[a-zA-Z'-]+|[0-9]{1,}%|[0-9]{1,}\.[0-9]{1,}%|\d+\.\d+%}"
):
    """Keep only the word-like tokens and percentage figures of a string.

    Hyphens are replaced by spaces first, then every non-overlapping match
    of ``reg`` is collected (in order) and re-joined with single spaces.
    With the default pattern this keeps runs of letters/apostrophes and
    percentages such as ``12%`` or ``3.5%``; bare numbers and any other
    punctuation are dropped.

    Args:
        string: Text to clean.
        reg: Regular expression whose matches are kept.

    Returns:
        The kept matches joined by a single space ("" when nothing matches).
    """
    # The default pattern is a raw string so that "\." and "\d" reach the
    # regex engine without relying on Python passing unknown string escapes
    # through (which emits a SyntaxWarning on recent Python versions). The
    # compiled pattern is character-for-character the same as before.
    # NOTE(review): the last alternative ends in a stray "}" and, for ASCII
    # digits, is shadowed by the alternative before it - it looks like a
    # typo. Left untouched so the default pattern does not change.
    dehyphenated = string.replace("-", " ")
    return " ".join(re.findall(reg, dehyphenated))
|
|
|
|
|
def preprocess_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, split it with NLTK's
    ``word_tokenize``, drop English stop words, Porter-stem the remaining
    tokens, join them with spaces and finally pass the result through
    ``tokenizer`` (default pattern).

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text as a single string.
    """
    words = word_tokenize(text.lower())

    english_stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Filter and stem in one pass; stop-word filtering happens before stemming.
    stems = [
        stemmer.stem(word) for word in words if word not in english_stop_words
    ]

    return tokenizer(" ".join(stems))
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
def get_spacy_model():
    """Load the ``en_core_web_trf`` spaCy pipeline.

    Returns:
        The object returned by ``spacy.load("en_core_web_trf")``.
    """
    nlp = spacy.load("en_core_web_trf")
    return nlp
|
|
|
|
|
@st.cache_resource
def get_flan_alpaca_xl_model():
    """Load the Flan-Alpaca-XL seq2seq model and tokenizer from local disk.

    Both are read from ``/home/user/app/models/flan-alpaca-xl/``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint_dir = "/home/user/app/models/flan-alpaca-xl/"
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)
    seq2seq_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    return seq2seq_model, seq2seq_tokenizer
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
def get_t5_model():
    """Build a ``summarization`` pipeline backed by ``t5-small``.

    The same checkpoint name is used for both the model and the tokenizer.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    checkpoint = "t5-small"
    return pipeline("summarization", model=checkpoint, tokenizer=checkpoint)
|
|
|
|
|
@st.cache_resource
def get_flan_t5_model():
    """Load the ``google/flan-t5-large`` tokenizer and model.

    Returns:
        A ``(model, tokenizer)`` tuple - note the model comes first even
        though the tokenizer is loaded first.
    """
    checkpoint = "google/flan-t5-large"
    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    return t5_model, t5_tokenizer
|
|
|
|
|
@st.cache_resource
def get_mpnet_embedding_model():
    """Load the ``all-mpnet-base-v2`` sentence-embedding model.

    The model is placed on CUDA when ``torch.cuda.is_available()`` is true,
    otherwise on the CPU, and its ``max_seq_length`` is set to 512.

    Returns:
        The configured ``SentenceTransformer`` instance.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder = SentenceTransformer(
        "sentence-transformers/all-mpnet-base-v2",
        device=target_device,
    )
    encoder.max_seq_length = 512
    return encoder
|
|
|
|
|
@st.cache_resource
def get_splade_sparse_embedding_model():
    """Load the SPLADE masked-LM checkpoint and its tokenizer.

    Uses ``naver/splade-cocondenser-ensembledistil``; the model is moved to
    CUDA when available, otherwise to the CPU.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint = "naver/splade-cocondenser-ensembledistil"
    target_device = "cuda" if torch.cuda.is_available() else "cpu"

    splade_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    splade_model = AutoModelForMaskedLM.from_pretrained(checkpoint)
    splade_model.to(target_device)

    return splade_model, splade_tokenizer
|
|
|
|
|
@st.cache_resource
def get_sgpt_embedding_model():
    """Load the SGPT-125M sentence-embedding model.

    Uses ``Muennighoff/SGPT-125M-weightedmean-nli-bitfit`` on CUDA when
    available (CPU otherwise) and sets ``max_seq_length`` to 512.

    Returns:
        The configured ``SentenceTransformer`` instance.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder = SentenceTransformer(
        "Muennighoff/SGPT-125M-weightedmean-nli-bitfit",
        device=target_device,
    )
    encoder.max_seq_length = 512
    return encoder
|
|
|
|
|
@st.cache_resource
def get_instructor_embedding_model():
    """Load the ``hkunlp/instructor-xl`` embedding model.

    The target device is chosen the same way as in the other embedding
    loaders in this file: CUDA when ``torch.cuda.is_available()`` is true,
    otherwise the CPU.

    Returns:
        The loaded ``INSTRUCTOR`` model.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Previously ``device`` was computed but never used; hand it to the
    # constructor explicitly so the placement is not left implicit.
    # NOTE(review): relies on INSTRUCTOR accepting SentenceTransformer's
    # ``device`` keyword - confirm against the installed InstructorEmbedding.
    model = INSTRUCTOR("hkunlp/instructor-xl", device=device)
    return model
|
|
|
@st.cache_resource
def get_instructor_embedding_model_api():
    """Create a Gradio client for the hosted Instructor-XL endpoint.

    Returns:
        A ``gradio_client.Client`` bound to
        ``https://awinml-api-instructor-xl-2.hf.space/``.
    """
    return Client("https://awinml-api-instructor-xl-2.hf.space/")
|
|
|
|
|
@st.cache_resource
def get_alpaca_model():
    """Create a Gradio client for the hosted Alpaca endpoint.

    Returns:
        A ``gradio_client.Client`` bound to
        ``https://awinml-alpaca-cpp.hf.space``.
    """
    return Client("https://awinml-alpaca-cpp.hf.space")
|
|
|
|
|
@st.cache_resource
def get_bm25_model(data):
    """Build a BM25Plus index over the ``Text`` column of ``data``.

    Each document is cleaned with ``preprocess_text`` and split on single
    spaces before indexing.

    Args:
        data: Object exposing a ``Text`` attribute with a ``tolist()``
            method (presumably the DataFrame from ``get_data`` - verify
            against the caller).

    Returns:
        A ``(corpus, bm25)`` tuple: the raw (uncleaned) documents as a list
        and the ``BM25Plus`` index built from their cleaned tokens.
    """
    corpus = data.Text.tolist()
    tokenized_corpus = [
        preprocess_text(document).split(" ") for document in corpus
    ]
    return corpus, BM25Plus(tokenized_corpus)
|
|
|
|
|
@st.cache_resource
def save_key(api_key):
    """Return ``api_key`` unchanged.

    The function body is an identity; any persistence comes solely from the
    ``st.cache_resource`` decorator wrapping it.

    Args:
        api_key: The key to hand back.

    Returns:
        The same ``api_key`` value.
    """
    return api_key
|
|
|
|
|
|
|
|
|
|
|
def gpt_turbo_model(prompt):
    """Send a single-turn prompt to ``gpt-3.5-turbo`` and return the reply.

    The prompt is sent as one ``user`` message through
    ``openai.ChatCompletion.create`` with ``temperature=0.01`` and
    ``max_tokens=1024``.

    Args:
        prompt: The text to send as the user message.

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    chat_messages = [
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
        temperature=0.01,
        max_tokens=1024,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]
|
|
|
|
|
def generate_text_flan_t5(model, tokenizer, input_text):
    """Generate text for ``input_text`` with a Flan-T5 style model.

    Args:
        model: Object exposing ``generate``.
        tokenizer: Callable tokenizer that returns an object with
            ``input_ids`` and also exposes ``decode``.
        input_text: The prompt to encode and feed to the model.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    encoded = tokenizer(input_text, return_tensors="pt")
    # NOTE(review): ``temperature`` is passed without ``do_sample`` - confirm
    # whether sampling was intended here.
    generated = model.generate(
        encoded.input_ids,
        temperature=0.5,
        max_length=512,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
|
|
|
|
|
|
|
|
|
|
|
def generate_entities_flan_alpaca_inference_api(prompt):
    """Query the hosted Flan-Alpaca-XL inference endpoint for ``prompt``.

    Sends a POST request with a JSON-encoded payload (sampling enabled,
    ``temperature=0.1``, ``max_length=80``, cache disabled, wait for the
    model to load) and the bearer token read from ``st.secrets["hg_key"]``.

    This is a best-effort call: any failure while sending the request or
    while extracting the text from the response yields an empty string
    instead of raising. The result is also printed to stdout.

    Args:
        prompt: The text sent as the ``inputs`` field of the payload.

    Returns:
        The ``generated_text`` of the first item in the JSON response, or
        ``""`` when the request or the parsing fails.

    Raises:
        Whatever ``st.secrets["hg_key"]`` raises when the secret is missing;
        that lookup is intentionally outside the best-effort section.
    """
    api_url = "https://api-inference.huggingface.co/models/declare-lab/flan-alpaca-xl"
    api_token = st.secrets["hg_key"]
    headers = {"Authorization": f"Bearer {api_token}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": True,
            "temperature": 0.1,
            "max_length": 80,
        },
        "options": {"use_cache": False, "wait_for_model": True},
    }

    try:
        # The Authorization header was previously built but never sent, so
        # every request went out unauthenticated.
        response = requests.request(
            "POST", api_url, headers=headers, data=json.dumps(payload)
        )
        body = json.loads(response.content.decode("utf-8"))
        output = body[0]["generated_text"]
    except Exception:
        # Deliberate best-effort fallback (network errors, non-JSON bodies,
        # error payloads without the expected shape). ``Exception`` rather
        # than a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
        output = ""

    print(output)
    return output
|
|
|
|
|
def generate_entities_flan_alpaca_checkpoint(model, tokenizer, prompt):
    """Generate text for ``prompt`` with a locally loaded checkpoint.

    Args:
        model: Object exposing ``generate``.
        tokenizer: Callable tokenizer whose result supports
            ``["input_ids"]`` and which also exposes ``decode``.
        prompt: The text to encode and feed to the model.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    generated = model.generate(
        input_ids=encoded_prompt["input_ids"],
        temperature=0.1,
        top_p=0.5,
        max_new_tokens=1024,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)
|
|