## Variables

```python
import os
import pathlib
from datetime import datetime as dt

import numpy as np
import pandas as pd
import plotly_express as px
import plotly.graph_objects as go
import streamlit as st
import tweepy
from datasets import Dataset
from huggingface_hub import Repository
from langchain import VectorDBQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from optimum.onnxruntime import ORTModelForSequenceClassification
# optimum's pipeline wraps the ONNX Runtime models; it is used in place of transformers.pipeline
from optimum.pipelines import pipeline
from st_aggrid import AgGrid, DataReturnMode, GridOptionsBuilder, GridUpdateMode
from transformers import AutoTokenizer


@st.experimental_singleton(suppress_st_warning=True)
def load_models():
    '''Load the sentiment and topic classification models'''
    sent_pipe = pipeline(task, model=sent_model_id, tokenizer=sent_model_id)
    topic_pipe = pipeline(task, model=topic_model_id, tokenizer=topic_model_id)

    return sent_pipe, topic_pipe


@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def process_tweets(df, df_users):
    '''Process tweets into a dataframe'''
    df['author'] = df['author'].astype(np.int64)
    df_merged = df.merge(df_users, on='author')
    tweet_list = df_merged['tweet'].tolist()

    # Run both classifiers over the tweet text
    sentiment = pd.DataFrame(sentiment_classifier(tweet_list))
    topic = pd.DataFrame(topic_classifier(tweet_list))

    sentiment.rename(columns={'score': 'sentiment_confidence', 'label': 'sentiment'}, inplace=True)
    topic.rename(columns={'score': 'topic_confidence', 'label': 'topic'}, inplace=True)

    df_group = pd.concat([df_merged, sentiment, topic], axis=1)

    # Express the confidence scores as percentages
    df_group[['sentiment_confidence', 'topic_confidence']] = df_group[
        ['sentiment_confidence', 'topic_confidence']
    ].round(2).mul(100)

    df_tweets = df_group[['creation_time', 'username', 'tweet', 'sentiment', 'topic',
                          'sentiment_confidence', 'topic_confidence']]
    df_tweets = df_tweets.sort_values(by=['creation_time'], ascending=False)

    return df_tweets


@st.experimental_singleton(suppress_st_warning=True)
def create_vectorstore(file, model):
    '''Create a FAISS vectorstore'''
    # Split tweets into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_text(file)

    if model == "hkunlp/instructor-large":
        emb = HuggingFaceInstructEmbeddings(
            model_name=model,
            query_instruction='Represent the Financial question for retrieving supporting documents: ',
            embed_instruction='Represent the Financial document for retrieval: ',
        )
    elif model == "sentence-transformers/all-mpnet-base-v2":
        emb = HuggingFaceEmbeddings(model_name=model)

    docsearch = FAISS.from_texts(texts, emb)

    return docsearch


@st.experimental_singleton(suppress_st_warning=True)
def embed_tweets(query, _prompt, _docsearch):
    '''Run a retrieval QA chain over the file of latest tweets'''
    chain_type_kwargs = {"prompt": _prompt}
    chain = VectorDBQA.from_chain_type(
        ChatOpenAI(temperature=0),
        chain_type="stuff",
        vectorstore=_docsearch,
        chain_type_kwargs=chain_type_kwargs,
        return_source_documents=True,
        k=3,
    )
    result = chain({"query": query})

    return result
```
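The two retrieval helpers above work together: `create_vectorstore` chunks the tweets file and indexes it with FAISS, and `embed_tweets` runs a retrieval-augmented query over that index. Here is a minimal sketch of how they fit together; the file name, prompt template, and question are hypothetical placeholders (the app builds its real prompt elsewhere), and `ChatOpenAI` needs an `OPENAI_API_KEY` in the environment:

```python
# Hypothetical usage of the helpers above; the file name, prompt, and
# query are illustrative placeholders, not values from the app.
tweets_text = pathlib.Path("tweets.txt").read_text()

docsearch = create_vectorstore(tweets_text, "sentence-transformers/all-mpnet-base-v2")

# The "stuff" chain expects a prompt with {context} and {question} variables
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        "Use the following tweets to answer the question:\n\n{context}"
    ),
    HumanMessagePromptTemplate.from_template("{question}"),
])

result = embed_tweets("What is the sentiment around Fed rate hikes?", prompt, docsearch)
print(result["result"])             # generated answer
print(result["source_documents"])   # retrieved tweet chunks
```

Note that `embed_tweets` sits behind Streamlit's singleton cache, which is why its prompt and vectorstore arguments carry a leading underscore: Streamlit skips hashing parameters whose names start with `_`, so the unhashable objects don't break caching.

The remaining module-level setup defines the Twitter credentials, model ids, label mappings, and the two classifier pipelines: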
```python
CONFIG = {
    "bearer_token": os.environ.get("bearer_token"),
}

sent_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-fintwitter-classification'
topic_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-finance-topic-classification'
task = 'text-classification'

sentiments = {"0": "Bearish", "1": "Bullish", "2": "Neutral"}

topics = {
    "0": "Analyst Update",
    "1": "Fed | Central Banks",
    "2": "Company | Product News",
    "3": "Treasuries | Corporate Debt",
    "4": "Dividend",
    "5": "Earnings",
    "6": "Energy | Oil",
    "7": "Financials",
    "8": "Currencies",
    "9": "General News | Opinion",
    "10": "Gold | Metals | Materials",
    "11": "IPO",
    "12": "Legal | Regulation",
    "13": "M&A | Investments",
    "14": "Macro",
    "15": "Markets",
    "16": "Politics",
    "17": "Personnel Change",
    "18": "Stock Commentary",
    "19": "Stock Movement",
}

sentiment_classifier, topic_classifier = load_models()


def convert_user_names(user_name: list):
    '''Convert user_names to the Tweepy query format'''
    users = []

    for user in user_name:
        users.append(f"from:{user}")

    return " OR ".join(users)
```
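`convert_user_names` produces the `from:` filter syntax that Twitter's search endpoints understand. A short sketch of how its output might feed a Tweepy query, assuming the handles below as placeholders (the app's actual fetch logic lives elsewhere):

```python
# Hypothetical example; the user handles are placeholders.
client = tweepy.Client(bearer_token=CONFIG["bearer_token"])

query = convert_user_names(["user_one", "user_two"])
# -> "from:user_one OR from:user_two"

response = client.search_recent_tweets(
    query=query,
    tweet_fields=["created_at", "author_id"],
    max_results=100,
)
```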