import functools
import re

import openai
import pandas as pd
import streamlit_scrollable_textbox as stx
import torch
from InstructorEmbedding import INSTRUCTOR
from gradio_client import Client
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
)
from rank_bm25 import BM25Okapi, BM25L, BM25Plus
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import streamlit as st


@st.cache_resource
def get_data() -> pd.DataFrame:
    """Load the earnings-call CSV into a DataFrame.

    The path is relative to the process working directory. The result is
    cached by ``st.cache_resource``.
    """
    data = pd.read_csv("earnings_calls_cleaned_metadata_keywords_indices.csv")
    return data


# Preprocessing for BM25

# Raw string so the regex escapes are not parsed as (invalid) string escapes;
# the pattern value itself is unchanged.
# NOTE(review): the last alternative ends in a literal "}" which looks like a
# typo. For ASCII digits it is shadowed by the alternative before it, so it is
# left as-is here -- confirm before removing.
_TOKEN_PATTERN = r"[a-zA-Z'-]+|[0-9]{1,}%|[0-9]{1,}\.[0-9]{1,}%|\d+\.\d+%}"


def tokenizer(string, reg=_TOKEN_PATTERN):
    """Keep only the substrings of *string* that match *reg*.

    Hyphens are replaced by spaces first, so hyphenated words are split
    before matching.

    Args:
        string: Text to filter.
        reg: Regular expression handed to ``re.findall``.

    Returns:
        The matches, in order, joined by single spaces.
    """
    dehyphenated = string.replace("-", " ")
    return " ".join(re.findall(reg, dehyphenated))


@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded once."""
    # Loaded lazily (not at import time) so importing this module does not
    # require the NLTK corpus to be present.
    return frozenset(stopwords.words("english"))


def preprocess_text(text):
    """Normalize *text* for BM25 indexing.

    Steps: lowercase, NLTK word tokenization, English stop-word removal,
    Porter stemming, then filtering through ``tokenizer``.

    Args:
        text: Raw document or query text.

    Returns:
        A single space-separated string of processed tokens.
    """
    tokens = word_tokenize(text.lower())

    stop_words = _english_stop_words()
    kept = [token for token in tokens if token not in stop_words]

    porter_stemmer = PorterStemmer()
    stemmed = [porter_stemmer.stem(token) for token in kept]

    return tokenizer(" ".join(stemmed))


# Initialize models from HuggingFace
@st.cache_resource
def get_splade_sparse_embedding_model():
    """Load the SPLADE masked-LM model and its tokenizer.

    The model is moved to CUDA when available, otherwise it stays on CPU.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model_name = "naver/splade-cocondenser-ensembledistil"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Distinct local names: avoid shadowing the module-level tokenizer().
    splade_tokenizer = AutoTokenizer.from_pretrained(model_name)
    splade_model = AutoModelForMaskedLM.from_pretrained(model_name)
    splade_model.to(device)
    return splade_model, splade_tokenizer


@st.cache_resource
def get_instructor_embedding_model():
    """Load the ``hkunlp/instructor-xl`` INSTRUCTOR model."""
    model = INSTRUCTOR("hkunlp/instructor-xl")
    return model


@st.cache_resource
def get_instructor_embedding_model_api():
    """Create a Gradio client for the hosted instructor-xl endpoint."""
    client = Client("https://awinml-api-instructor-xl-2.hf.space/")
    return client


@st.cache_resource
def get_alpaca_model():
    """Create a Gradio client for the hosted alpaca-cpp endpoint."""
    client = Client("https://awinml-alpaca-cpp.hf.space")
    return client


@st.cache_resource
def get_vicuna_ner_1_model():
    """Create a Gradio client for the first hosted Vicuna NER endpoint."""
    client = Client("https://awinml-api-vicuna-openblas-ner-1.hf.space/")
    return client


@st.cache_resource
def get_vicuna_ner_2_model():
    """Create a Gradio client for the second hosted Vicuna NER endpoint."""
    client = Client("https://awinml-api-vicuna-openblas-ner-2.hf.space/")
    return client


@st.cache_resource
def get_vicuna_text_gen_model():
    """Create a Gradio client for the hosted Vicuna text-generation endpoint."""
    client = Client("https://awinml-api-vicuna-openblas-4.hf.space/")
    return client


def get_bm25_model(data):
    """Build a BM25Plus index over the ``Text`` column of *data*.

    Args:
        data: Object exposing a ``Text`` attribute with ``tolist()``
            (presumably the DataFrame from ``get_data`` -- verify at caller).

    Returns:
        A ``(corpus, bm25)`` tuple: the raw texts as a list and the
        ``BM25Plus`` index built from their preprocessed, space-split tokens.
    """
    corpus = data.Text.tolist()
    corpus_clean = [preprocess_text(doc) for doc in corpus]
    tokenized_corpus = [doc.split(" ") for doc in corpus_clean]
    bm25 = BM25Plus(tokenized_corpus)
    return corpus, bm25


@st.cache_resource
def save_key(api_key):
    """Return *api_key* unchanged, memoized through ``st.cache_resource``.

    NOTE(review): cache_resource entries are shared beyond a single session;
    confirm that caching an API key this way is intended.
    """
    return api_key


# Text Generation


def vicuna_text_generate(prompt, model):
    """Send *prompt* to *model*'s ``/predict`` endpoint and return the result.

    Args:
        prompt: Prompt text forwarded to the endpoint.
        model: Client exposing ``predict(prompt, api_name=...)``.
    """
    generated_text = model.predict(prompt, api_name="/predict")
    return generated_text


def gpt_turbo_model(prompt):
    """Ask ``gpt-3.5-turbo`` for a completion of *prompt*.

    The prompt is sent as a single user message with temperature 0.01 and
    a 1024-token limit.

    NOTE(review): ``openai.ChatCompletion`` is the pre-1.0 openai interface;
    confirm the pinned openai version still provides it.

    Returns:
        The message content of the first returned choice.
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt},
        ],
        temperature=0.01,
        max_tokens=1024,
    )
    return response["choices"][0]["message"]["content"]