HeavenWaters committed
Commit fad4db7
1 Parent(s): f6b7d10

Upload 3 files

Files changed (3)
  1. api.py +259 -0
  2. app.py +62 -0
  3. requirements.txt +17 -0
api.py ADDED
@@ -0,0 +1,259 @@
# Install the FAISS package first:
# pip install faiss-cpu

from flask import Flask, request, jsonify
from dotenv import load_dotenv
import pandas as pd
from PyPDF2 import PdfReader
import openai
import spacy
from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter
#from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
#from langchain_core.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFaceHub
# from bert_score import score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
import markdown
import os
import pickle
from flask_cors import CORS
#import ast
import requests
import numpy as np

app = Flask(__name__)
CORS(app)

conversation = ""


def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page in the given PDF files."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


def get_text_chunks(raw_text):
    # Earlier character-based splitter, kept for reference:
    # text_splitter = CharacterTextSplitter(separator='\n', chunk_size=3000, chunk_overlap=400, length_function=len)
    # return text_splitter.split_text(raw_text)

    # Split on semantic similarity between adjacent sentences instead of a
    # fixed character count.
    model = SentenceTransformersSimilarity()
    sentence_splitter = SpacySentenceSplitter()
    splitter = SimilarSentenceSplitter(model, sentence_splitter)
    chunks = splitter.split(raw_text)
    return chunks


def get_vectorstore(text_chunks, vectorstore_filename="vectorstore.faiss"):
    """Load the pickled FAISS index if it exists; otherwise build and cache it."""
    if os.path.exists(vectorstore_filename):
        with open(vectorstore_filename, 'rb') as file:
            vectorstore = pickle.load(file)
        print("Vectorstore loaded from disk")
    else:
        embeddings = OpenAIEmbeddings()
        vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
        with open(vectorstore_filename, 'wb') as file:
            pickle.dump(vectorstore, file)
    return vectorstore


def get_conversation_chain(vectorstore):
    llm = ChatOpenAI(max_tokens=300)
    # Open models tried during development:
    # llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-v0.1", model_kwargs={"temperature": 0.9, "max_length": 2048}, task="text-generation")
    # llm = HuggingFaceHub(repo_id="microsoft/phi-2", model_kwargs={"temperature": 0.1, "max_length": 1024}, task="text-generation")
    # llm = HuggingFaceHub(repo_id="FinGPT/fingpt-mt_qwen-7b_lora", model_kwargs={"temperature": 0.5, "max_length": 1024}, task="text-generation")
    memory = ConversationBufferMemory(memory_key='chat_history', output_key='answer', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
        response_if_no_docs_found="I don't have this information",
        rephrase_question=False,
        return_source_documents=True,
    )
    return conversation_chain


@app.route('/send_to_backend', methods=['POST'])
def send_to_backend():
    global conversation
    question = request.get_json().get("userMsg")
    print("Question:", question)

    # Ask a secondary Llama-style chat server for an additional answer to use
    # as extra context for the retrieval chain.
    url = "http://119.63.132.178:8080/chat"
    try:
        # Make a POST request to the external chat API
        response = requests.post(url, json={'text': question})
        llamaAnswer = response.json().get('Answer')
    except Exception:
        llamaAnswer = "couldn't get response!"

    # Query the retrieval chain; if the accumulated chat history exceeds the
    # model's context limit, rebuild the chain from the cached vectorstore and
    # retry once.
    try:
        response_content = conversation({'question': question + f" (You can use this additional context in addition to the context\n Additional-Context: {llamaAnswer})"})
    except Exception:
        print("conversation chain limit exceeded")
        text_chunks = ""  # unused here: the cached index is loaded from disk
        vectorstore = get_vectorstore(text_chunks)
        conversation = get_conversation_chain(vectorstore)
        response_content = conversation({'question': question + f" (You can use this additional context in addition to the context\n Additional-Context: {llamaAnswer})"})

    response_message = response_content.get('answer')
    response_context = response_content.get('source_documents')

    # TF-IDF cosine similarity between the answer and its retrieved context,
    # shifted and normalized, serves as a rough faithfulness score.
    documents = [response_message, str(response_context)]
    F1 = cosine_similarity(TfidfVectorizer().fit_transform(documents), TfidfVectorizer().fit_transform(documents))
    F1 = (F1[0][1] + 0.3) / np.linalg.norm(F1[0])

    print("answer:", response_message)
    print("llamaAnswer:", llamaAnswer)

    # Earlier combined prompt, kept for reference:
    # prompt = [{"role": "user", "content": f"""
    # Respond to the given 'Input' using the provided contexts 'Context1' and 'Context2' respectively. Prefer to include information from 'Context1' in your response. For inputs such as "Hey," "Hi," or "Can you help me," or any other greetings in any language, respond from 'Context1'. Do not use phrases like "from the context provided" and do not refer to 'Context1' or 'Context2' in any way in your response. Your response should be formatted using HTML tags for improved readability.
    # Input: [{question}] \n
    # Context1: [{response_message}] \n
    # Context2: [{llamaAnswer}]
    # """}]

    # Ask a completion model to wrap the final answer in HTML for display.
    prompt = f"""
    Use HTML formatting on the text labelled 'Text' to improve readability. Do not refer to 'Text' in your response.\n
    Text: {response_message}
    """

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=500  # adjust based on the desired answer length
    )

    finalAnswer = markdown.markdown(response.choices[0].text.strip())
    print("final Answer:", finalAnswer)

    return jsonify({"response": finalAnswer + """<br><p style="color: yellow; text-align: right; font-style: italic; font-size: 14px; margin-bottom: 0;">RAGAS SCORE: """ + str(F1) + """</p>"""})


if __name__ == '__main__':
    load_dotenv()
    if not spacy.util.is_package("en_core_web_sm"):
        # Download the sentence-splitting model on first run
        spacy.cli.download("en_core_web_sm")

    pdf_docs = ["totaltax.pdf"]
    raw_text = get_pdf_text(pdf_docs)
    # Split the raw text into two overlapping slices so the semantic splitter
    # can process the document in manageable pieces.
    raw_text1 = raw_text[0:999999]
    raw_text2 = raw_text[999000:]
    text_chunks1 = get_text_chunks(raw_text1)
    text_chunks2 = get_text_chunks(raw_text2)
    text_chunkslist = text_chunks1 + text_chunks2
    text_chunks = []
    for chunk in text_chunkslist:
        # Each chunk is a list of sentences; strip the surrounding brackets
        # from its string representation.
        textelem = str(chunk)
        textelem = textelem[1:len(textelem) - 2]
        text_chunks.append(textelem)

    vectorstore = get_vectorstore(text_chunks)
    conversation = get_conversation_chain(vectorstore)

    # Offline evaluation harness (disabled): runs questions from an Excel
    # sheet through the chain and records BERTScore against ground truths.
    """
    questions = []
    answers = []
    generated_answers = []
    contexts = []

    excel_file_path = r'MMRRetriever\MistralT0.9MMR.xlsx'
    text_file_path = 'output.txt'

    df = pd.read_excel(excel_file_path)
    questions = df['question'].tolist()
    answers = df['ground_truths'].tolist()

    # Resume from a previous partial run:
    # with open(text_file_path, 'r') as file:
    #     generated_answers = ast.literal_eval(file.read())
    # questions = questions[len(generated_answers):]

    for question in questions:
        try:
            response = conversation({'question': str(question)})
        except Exception:
            conversation = get_conversation_chain(vectorstore)
            response = conversation({'question': question})

        generated_answers.append(str(response['answer']))
        contexts.append(str(response['source_documents']))

        # Checkpoint the answers so an interrupted run can resume
        with open(text_file_path, 'w') as file:
            file.write(str(generated_answers))

    while len(contexts) != len(generated_answers):
        contexts.append("")

    df["context"] = contexts
    df["generated_answer"] = generated_answers

    P, R, F1 = score(generated_answers, answers, lang="en")  # needs bert_score
    df["Bert Score"] = list(F1)
    print(F1.mean())

    df.to_excel(excel_file_path, index=False)
    """

    print("Backend ready")
    app.run(port=3000)
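
A minimal client sketch for the endpoint above. The port (3000) and the "userMsg"/"response" JSON keys come from api.py; the sample question is hypothetical.

import requests

resp = requests.post(
    "http://localhost:3000/send_to_backend",
    json={"userMsg": "What is the tax treatment of salaried income?"},  # hypothetical question
)
print(resp.json()["response"])  # HTML-formatted answer with the appended score line
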
app.py ADDED
@@ -0,0 +1,62 @@
import streamlit as st
import subprocess
import requests

st.title("Tax Tajweez")

# Initialize the chat history if it doesn't exist
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display previous chat messages
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"], unsafe_allow_html=True)

# Get user input
if prompt := st.chat_input("Ask me anything related to income tax..."):
    # Add the user message to session state
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Get the assistant response from the Flask backend
    with st.expander("Assistant Response", expanded=True):
        with st.spinner("I'm thinking..."):
            # Endpoint exposed by api.py
            url = "http://localhost:3000/send_to_backend"
            # Request body expected by /send_to_backend
            data = {"userMsg": prompt}
            response = requests.post(url, json=data)
            # Check if the request was successful (status code 200)
            if response.status_code == 200:
                # Render the response, allowing the HTML produced by the backend
                assistant_response = response.json()['response']
                if assistant_response not in [msg.get("content") for msg in st.session_state.messages if msg.get("role") == "assistant"]:
                    st.markdown(assistant_response, unsafe_allow_html=True)
                    # Add the assistant's response to session state
                    st.session_state.messages.append({"role": "assistant", "content": assistant_response})
            else:
                st.error(f"Error: {response.status_code}")

# Launch the Flask backend; note that subprocess.run blocks until api.py exits
file_path = 'api.py'
subprocess.run(['python', file_path])
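
One caveat with the launcher above: subprocess.run blocks until api.py exits, and Streamlit re-executes the whole script on every interaction. A sketch of a non-blocking alternative, assuming the backend should be started once per browser session:

import subprocess
import streamlit as st

# subprocess.Popen returns immediately, so Streamlit reruns are not blocked,
# and session_state keeps the process from being relaunched on every rerun.
if "backend_process" not in st.session_state:
    st.session_state.backend_process = subprocess.Popen(["python", "api.py"])
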
requirements.txt ADDED
@@ -0,0 +1,17 @@
flask
flask-cors
python-dotenv
pypdf2
pydantic
pandas
langchain==0.0.345
faiss-cpu==1.7.4
openai==0.28.0
huggingface_hub
sentence_transformers
semantic-split==0.1.0
tiktoken
cohere
spacy==3.7.2
markdown2
markdown
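
These pins can be installed with pip install -r requirements.txt. Note that api.py downloads the spaCy en_core_web_sm model on first run if it is missing, and the UI is started with streamlit run app.py, which in turn launches the Flask backend on port 3000.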