bohmian committed on
Commit
1de170c
1 Parent(s): 53f5863
Reference_Chat_with_Earnings_Calls_Transcripts.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,120 @@
+ # for setting/extracting environment variables such as API keys
+ import os
+
+ ### 1. For Web Scraping
+ # for querying the Financial Modeling Prep API
+ from urllib.request import urlopen
+ import json
+
+ ### 2. For Converting Scraped Text Into a Vector Store of Chunked Documents
+ # for tokenizing texts and splitting them into chunks of documents
+ from transformers import GPT2TokenizerFast
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # for turning documents into embeddings before putting them in the vector store
+ from langchain.embeddings import HuggingFaceEmbeddings
+ # for the vector store that holds the documents
+ from langchain.vectorstores import Chroma
+
+ ### 3. For Querying the LLM
+ # for loading HuggingFace LLM models from the hub
+ from langchain.llms import HuggingFaceHub
+ # for querying the LLM conveniently using the retrieved context
+ from langchain.chains.question_answering import load_qa_chain
+
+ ### 4. For the Gradio App UI
+ import gradio as gr
+
+ fmp_api_key = os.environ['FMP_API_KEY']
+
+
+ def get_jsonparsed_data(url):
+     """Fetch a URL and parse the JSON response."""
+     response = urlopen(url)
+     data = response.read().decode("utf-8")
+     return json.loads(data)
+
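As a reference for how this helper might be used (not part of this commit): the endpoint path, query parameters, and the "content" field below are assumptions about Financial Modeling Prep's v3 transcript API, not taken from this repo.

# Hypothetical usage of get_jsonparsed_data (endpoint and fields assumed)
url = ("https://financialmodelingprep.com/api/v3/earning_call_transcript/TSLA"
       f"?year=2023&quarter=1&apikey={fmp_api_key}")
transcripts = get_jsonparsed_data(url)   # assumed: a list of transcript records
if transcripts:
    print(transcripts[0]["content"][:200])  # first 200 characters of the transcript text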
+ # initialize the tokenizer and splitter used to split the texts into chunks later (feel free to try others)
+ tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+ text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=200, chunk_overlap=20)
+
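A quick, hypothetical illustration of what this splitter produces; the sample text is invented:

# Demo only: split a long string into chunks of at most ~200 GPT-2 tokens, overlapping by ~20
sample = "Tesla reported record deliveries and improving margins this quarter. " * 50
chunks = text_splitter.split_text(sample)
print(len(chunks), "chunks; first chunk starts with:", chunks[0][:60])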
+ # initialize the default embedding model (sentence-transformers/all-mpnet-base-v2); the transcripts are stored in this embedded form in the vector database
+ hf_embeddings = HuggingFaceEmbeddings()
+
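A one-line sanity check of the embedder; the 768-dimensional output assumes the default all-mpnet-base-v2 model:

vector = hf_embeddings.embed_query("How were Tesla's margins this quarter?")
print(len(vector))  # expected: 768 for the default model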
+ # Load the HuggingFace inference endpoint of an LLM
+ # Name of the LLM model we are using, feel free to try others!
+ model = "mistralai/Mistral-7B-Instruct-v0.1"
+
+ # This is an inference API from HuggingFace: the model is not run locally, it runs on HuggingFace's servers
+ # (requires the HUGGINGFACEHUB_API_TOKEN environment variable to be set)
+ hf_llm = HuggingFaceHub(repo_id=model, model_kwargs={"temperature": 0.5, "max_new_tokens": 300})
+
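A direct smoke test of the remote endpoint might look like this; the prompt is illustrative and output will vary:

print(hf_llm("[INST] In one sentence, what is an earnings call? [/INST]"))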
+ # fetch a pre-built Chroma database of the chunked, embedded transcripts (removing any stale copy first)
+ os.system("rm -r chromadb_earnings_transcripts_extracted")
+ os.system("rm earnings_transcripts_chromadb.zip")
+ os.system("wget https://github.com/damianboh/test_earnings_calls/raw/main/earnings_transcripts_chromadb.zip")
+ os.system("unzip earnings_transcripts_chromadb.zip -d chromadb_earnings_transcripts_extracted")
+
+ # load the persisted vector store with the same embedding function it was built with
+ chroma_db = Chroma(persist_directory='chromadb_earnings_transcripts_extracted/chromadb_earnings_transcripts', embedding_function=hf_embeddings)
+
+
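For context, a minimal sketch of how a persisted store like this could have been built from raw transcript text. The variable transcript_text and the title string are assumptions (the actual build lives in the notebook added above), though the 'title' metadata key matches what the app reads below:

# Hypothetical build step (not in this commit): chunk transcripts and persist to Chroma
texts = text_splitter.split_text(transcript_text)  # transcript_text: assumed raw transcript string
metadatas = [{"title": "Tesla Q1 2023 Earnings Call"}] * len(texts)
db = Chroma.from_texts(texts, hf_embeddings, metadatas=metadatas,
                       persist_directory="chromadb_earnings_transcripts")
db.persist()  # write the store to disk so it can be zipped and re-used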
+ def source_question_answer(query: str, vectorstore: Chroma = chroma_db, llm: HuggingFaceHub = hf_llm):
+     """
+     Return the answer to the query, plus the four source chunks and their titles.
+     """
+     # retrieve the 4 most similar chunks from the vector store to use as context
+     input_docs = vectorstore.similarity_search(query, k=4)
+     qa_chain = load_qa_chain(llm, chain_type="stuff")
+     # wrap the question in Mistral's instruction tags (note the closing [/INST])
+     query = f"[INST]According to the earnings call transcripts, {query}[/INST]"
+
+     response = qa_chain.run(input_documents=input_docs, question=query)
+     source_docs_1 = input_docs[0].page_content
+     source_docs_2 = input_docs[1].page_content
+     source_docs_3 = input_docs[2].page_content
+     source_docs_4 = input_docs[3].page_content
+
+     source_title_1 = input_docs[0].metadata['title']
+     source_title_2 = input_docs[1].metadata['title']
+     source_title_3 = input_docs[2].metadata['title']
+     source_title_4 = input_docs[3].metadata['title']
+
+     return response, source_docs_1, source_docs_2, source_docs_3, source_docs_4, source_title_1, source_title_2, source_title_3, source_title_4
+
+
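Outside the UI, the chain can be exercised directly; a minimal check (the question is an example and answers will vary):

answer, *sources_and_titles = source_question_answer("What did management say about margins?")
print(answer)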
+ with gr.Blocks() as app:
+
+     with gr.Row():
+         gr.HTML("<h1>Chat with Tesla 2023 Earnings Calls Transcripts</h1>")
+
+     with gr.Row():
+         query = gr.Textbox("Is Elon happy about Tesla?", placeholder="Enter question here...", label="Enter question")
+         btn = gr.Button("Ask Question")
+
+     with gr.Row():
+         gr.HTML("<h3>Answer</h3>")
+
+     with gr.Row():
+         answer = gr.Textbox(label="Answer")
+
+     with gr.Row():
+         gr.HTML("<h3>Sources Referenced from Tesla 2023 Earnings Calls Transcripts</h3>")
+
+     with gr.Row():
+         with gr.Column():
+             source_title_1 = gr.Markdown()
+             source1 = gr.Textbox(label="Source Text 1")
+         with gr.Column():
+             source_title_2 = gr.Markdown()
+             source2 = gr.Textbox(label="Source Text 2")
+
+     with gr.Row():
+         with gr.Column():
+             source_title_3 = gr.Markdown()
+             source3 = gr.Textbox(label="Source Text 3")
+         with gr.Column():
+             source_title_4 = gr.Markdown()
+             source4 = gr.Textbox(label="Source Text 4")
+
+     # wire both Enter-in-textbox and the button click to the same handler
+     query.submit(fn=source_question_answer, inputs=[query],
+                  outputs=[answer, source1, source2, source3, source4, source_title_1, source_title_2, source_title_3, source_title_4])
+
+     btn.click(fn=source_question_answer, inputs=[query],
+               outputs=[answer, source1, source2, source3, source4, source_title_1, source_title_2, source_title_3, source_title_4])
+
+ app.launch(share=True, debug=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio==3.45.2
+ chromadb==0.4.13
+ langchain==0.0.305
+ transformers==4.33.3
+ sentence-transformers==2.2.2