Spaces:
Build error
Build error
##Variables | |
import os | |
import streamlit as st | |
import pathlib | |
from langchain.embeddings import HuggingFaceEmbeddings,HuggingFaceInstructEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores import FAISS | |
from langchain.chat_models.openai import ChatOpenAI | |
from langchain import VectorDBQA | |
import pandas as pd | |
from langchain.chat_models import ChatOpenAI | |
from langchain.prompts.chat import ( | |
ChatPromptTemplate, | |
SystemMessagePromptTemplate, | |
AIMessagePromptTemplate, | |
HumanMessagePromptTemplate, | |
) | |
from langchain.schema import ( | |
AIMessage, | |
HumanMessage, | |
SystemMessage | |
) | |
from optimum.onnxruntime import ORTModelForSequenceClassification | |
from transformers import pipeline, AutoTokenizer | |
from optimum.pipelines import pipeline | |
import tweepy | |
import pandas as pd | |
import numpy as np | |
import plotly_express as px | |
import plotly.graph_objects as go | |
from datetime import datetime as dt | |
from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode | |
from datasets import Dataset | |
from huggingface_hub import Repository | |
def load_models():
    """Build the sentiment and topic classification pipelines.

    Both pipelines are created through the ``pipeline`` name in scope (the
    ``optimum.pipelines`` import comes after the ``transformers`` one and
    therefore shadows it) using the module-level ``task``. Each model id is
    passed both as the model and as its tokenizer.

    Returns:
        tuple: ``(sentiment_pipeline, topic_pipeline)``, in that order.
    """
    return tuple(
        pipeline(task, model=model_id, tokenizer=model_id)
        for model_id in (sent_model_id, topic_model_id)
    )
def process_tweets(df, df_users):
    """Join tweets with their authors and attach sentiment/topic predictions.

    Args:
        df: Tweet frame. The ``author``, ``tweet`` and ``creation_time``
            columns are read; ``author`` must be castable to ``int64``.
            The frame is NOT modified.
        df_users: User frame joined on ``author``; expected to provide the
            ``username`` column.

    Returns:
        pandas.DataFrame with the columns ``creation_time``, ``username``,
        ``tweet``, ``sentiment``, ``topic``, ``sentiment_confidence`` and
        ``topic_confidence``, sorted by ``creation_time`` descending. The two
        confidence columns hold the classifier score rounded to two decimals
        and multiplied by 100. When no tweet matches a user the result is an
        empty frame with the same columns.
    """
    output_columns = [
        'creation_time', 'username', 'tweet', 'sentiment', 'topic',
        'sentiment_confidence', 'topic_confidence',
    ]
    confidence_columns = ['sentiment_confidence', 'topic_confidence']

    # Cast on a derived frame: assigning back into ``df`` would silently
    # change the caller's (possibly cached) DataFrame.
    tweets = df.assign(author=df['author'].astype(np.int64))
    df_merged = tweets.merge(df_users, on='author')

    # Nothing to classify: without this guard the confidence columns would
    # never be created and the column selection below would raise KeyError.
    if df_merged.empty:
        return df_merged.reindex(columns=output_columns)

    tweet_list = df_merged['tweet'].tolist()
    sentiment = pd.DataFrame(sentiment_classifier(tweet_list)).rename(
        columns={'score': 'sentiment_confidence', 'label': 'sentiment'}
    )
    topic = pd.DataFrame(topic_classifier(tweet_list)).rename(
        columns={'score': 'topic_confidence', 'label': 'topic'}
    )

    # merge() returns a fresh 0..n-1 index, which lines up row-for-row with
    # the prediction frames built from the same ordered tweet list.
    df_group = pd.concat([df_merged, sentiment, topic], axis=1)
    df_group[confidence_columns] = df_group[confidence_columns].round(2).mul(100)

    df_tweets = df_group[output_columns]
    return df_tweets.sort_values(by=['creation_time'], ascending=False)
def embed_tweets(file, model, query, prompt):
    """Answer ``query`` over the given tweet text using a FAISS-backed QA chain.

    The text is split into overlapping chunks, embedded with the requested
    Hugging Face model, indexed in FAISS and queried through a "stuff"
    ``VectorDBQA`` chain driven by ``ChatOpenAI(temperature=0)``.

    Args:
        file: The tweet text to index (passed straight to ``split_text``).
        model: Embedding model id. Supported values are
            ``"hkunlp/instructor-large"`` and
            ``"sentence-transformers/all-mpnet-base-v2"``.
        query: The question to ask the chain.
        prompt: Prompt object forwarded to the chain as ``prompt``.

    Returns:
        Whatever the chain returns for ``{"query": query}``.

    Raises:
        ValueError: If ``model`` is not one of the supported ids. Previously
            this surfaced as an ``UnboundLocalError`` on ``emb``, and only
            after the text had already been split.
    """
    instructor_model = "hkunlp/instructor-large"
    mpnet_model = "sentence-transformers/all-mpnet-base-v2"

    # Validate up front so a bad model id fails before any expensive work.
    if model not in (instructor_model, mpnet_model):
        raise ValueError(
            f"Unsupported embedding model {model!r}; "
            f"expected {instructor_model!r} or {mpnet_model!r}"
        )

    # Split tweets into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_text(file)

    if model == instructor_model:
        emb = HuggingFaceInstructEmbeddings(
            model_name=model,
            query_instruction='Represent the Financial question for retrieving supporting documents: ',
            embed_instruction='Represent the Financial document for retrieval: ',
        )
    else:
        emb = HuggingFaceEmbeddings(model_name=model)

    docsearch = FAISS.from_texts(texts, emb)

    chain_type_kwargs = {"prompt": prompt}
    chain = VectorDBQA.from_chain_type(
        ChatOpenAI(temperature=0),
        chain_type="stuff",
        vectorstore=docsearch,
        chain_type_kwargs=chain_type_kwargs,
    )

    result = chain({"query": query})
    return result
# Runtime configuration; the bearer token is read from the environment and is
# None when the "bearer_token" variable is not set.
CONFIG = dict(bearer_token=os.environ.get("bearer_token"))

# Model ids and the pipeline task used by load_models().
task = 'text-classification'
sent_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-fintwitter-classification'
topic_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-finance-topic-classification'
# Label tables keyed by the class index rendered as a string ("0", "1", ...).
sentiments = {
    str(index): label
    for index, label in enumerate(("Bearish", "Bullish", "Neutral"))
}

topics = {
    str(index): label
    for index, label in enumerate((
        "Analyst Update",
        "Fed | Central Banks",
        "Company | Product News",
        "Treasuries | Corporate Debt",
        "Dividend",
        "Earnings",
        "Energy | Oil",
        "Financials",
        "Currencies",
        "General News | Opinion",
        "Gold | Metals | Materials",
        "IPO",
        "Legal | Regulation",
        "M&A | Investments",
        "Macro",
        "Markets",
        "Politics",
        "Personnel Change",
        "Stock Commentary",
        "Stock Movement",
    ))
}
# Account names to follow, in their original order.
# NOTE(review): "NYSE" is listed twice (indices 11 and 18) - confirm whether
# the duplicate is intentional before de-duplicating.
user_name = [
    "Investing.com", "(((The Daily Shot)))", "Bloomberg Markets", "FirstSquawk",
    "MarketWatch", "markets", "FinancialTimes", "CNBC",
    "ReutersBiz", "BreakingNews", "LiveSquawk", "NYSE",
    "WSJmarkets", "FT", "TheStreet", "ftfinancenews",
    "BloombergTV", "Nasdaq", "NYSE", "federalreserve",
    "NewYorkFed", "sffed", "WSJCentralBanks", "RichmondFed",
    "ecb", "stlouisfed", "WorldBank", "MarketCurrents",
    "OpenOutcrier", "BullTradeFinder", "WallStChatter", "Briefingcom",
    "SeekingAlpha", "realDonaldTrump", "AswathDamodaran", "ukarlewitz",
    "alphatrends", "Investor666", "ACInvestorBlog", "ZorTrades",
    "ScottNations", "TradersCorner", "TraderGoalieOne", "option_snipper",
    "jasonleavitt", "LMT978", "OptionsHawk", "andrewbtodd",
    "Terri1618", "SunriseTrader", "traderstewie", "TMLTrader",
    "IncredibleTrade", "NYFedResearch", "YahooFinance", "business",
    "economics", "IMFNews", "Market_Screener", "QuickTake",
    "NewsFromBW", "BNCommodities",
]
# Numeric account ids, kept as strings and in their original order.
# NOTE(review): "69620713" (indices 2 and 6) and "21323268" (indices 11 and
# 18) each appear twice - confirm whether that is intentional.
user_id = """
988955288 423769635 69620713 59393368 3295423333 624413
69620713 4898091 20402945 15110357 6017542 21323268
28164923 18949452 15281391 11014272 35002876 18639734
21323268 26538229 15072071 117237387 327484803 16532451
83466368 71567590 27860681 15296897 2334614718 2222635612
3382363841 72928001 23059499 25073877 33216611 37284991
15246621 293458690 55561590 18560146 244978426 85523269
276714687 2806294664 16205561 1064700308 61342056 184126162
405820375 787439438964068352 52166809 2715646770 47247213 374672240
19546277 34713362 144274618 25098482 102325185 252751061
976297820532518914 804556370
""".split()
# Build both pipelines once at import time; process_tweets() reads these
# module-level names when classifying tweets.
(sentiment_classifier, topic_classifier) = load_models()
def convert_user_names(user_name: list) -> str:
    """Convert account names into a single search-query string for tweepy.

    Each name is prefixed with ``from:`` and the pieces are joined with
    ``" OR "``, e.g. ``["a", "b"]`` becomes ``"from:a OR from:b"``.

    Args:
        user_name: Account names to include. Any iterable of names works.

    Returns:
        The combined query string; an empty string when no names are given.
    """
    return " OR ".join(f"from:{user}" for user in user_name)