Commit 65a4e68 • Update api.py
Parent(s): fad4db7

api.py CHANGED
@@ -1,6 +1,3 @@
-# Install the faiss package first:
-# pip install faiss
-
 from flask import Flask, request, jsonify
 from dotenv import load_dotenv
 import pandas as pd
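(Dropping the stale install hint is reasonable in any case: the maintained FAISS wheels on PyPI are published as faiss-cpu and faiss-gpu, so a plain "pip install faiss" would not have pulled in the official library this code depends on.)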
@@ -8,25 +5,19 @@ from PyPDF2 import PdfReader
 import openai
 import spacy
 from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter
-#from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
-#from langchain_core.prompts import PromptTemplate
 from langchain.chat_models import ChatOpenAI
 from langchain.llms import HuggingFaceHub
-# from bert_score import score
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-#from transformers import AutoTokenizer, AutoModel
 from scipy.spatial.distance import cosine
 import markdown
 import os
 import pickle
 from flask_cors import CORS
-#import pandas as pd
-#import ast
 import requests
 import numpy as np
 
@@ -46,11 +37,6 @@ def get_pdf_text(pdf_docs):
     return text
 
 def get_text_chunks(raw_text):
-    """
-    text_splitter = CharacterTextSplitter(separator='\n', chunk_size=3000, chunk_overlap=400, length_function=len)
-    chunks = text_splitter.split_text(raw_text)
-    return chunks
-    """
     model = SentenceTransformersSimilarity()
     sentence_splitter = SpacySentenceSplitter()
     splitter = SimilarSentenceSplitter(model, sentence_splitter)
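Note that both get_text_chunks hunks skip the line that actually produces the chunks (old line 57 / new line 43 falls between them); going by semantic_split's documented API, it is presumably a splitter.split(...) call. A minimal sketch of the post-commit function under that assumption:

    from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter

    def get_text_chunks(raw_text):
        # Group adjacent sentences by embedding similarity instead of the
        # fixed 3000-character windows of the deleted CharacterTextSplitter path.
        model = SentenceTransformersSimilarity()      # sentence-transformers similarity scorer
        sentence_splitter = SpacySentenceSplitter()   # spaCy sentence boundaries
        splitter = SimilarSentenceSplitter(model, sentence_splitter)
        chunks = splitter.split(raw_text)             # assumed: the line the diff does not show
        return chunks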
@@ -58,11 +44,10 @@ def get_text_chunks(raw_text):
     return chunks
 
 def get_vectorstore(text_chunks, vectorstore_filename="vectorstore.faiss"):
-    print("I'm in")
     if os.path.exists(vectorstore_filename):
         with open(vectorstore_filename, 'rb') as file:
             vectorstore = pickle.load(file)
-        print("
+        print("vectorstore loaded")
     else:
         embeddings = OpenAIEmbeddings()
         vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
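The caching above round-trips the whole vectorstore through pickle (the write presumably happens on old lines 69-71 / new lines 54-56, outside the hunk). Raw pickles of a FAISS-backed store are brittle across library versions; for comparison, a sketch of the same cache-or-build pattern using the FAISS wrapper's own save_local/load_local persistence (the function and directory names here are illustrative):

    import os
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import FAISS

    def get_vectorstore_local(text_chunks, index_dir="vectorstore_index"):
        embeddings = OpenAIEmbeddings()
        if os.path.exists(index_dir):
            # Reload the previously saved index instead of re-embedding.
            return FAISS.load_local(index_dir, embeddings)
        vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
        vectorstore.save_local(index_dir)   # persist for the next start-up
        return vectorstore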
@@ -72,10 +57,11 @@ def get_vectorstore(text_chunks, vectorstore_filename="vectorstore.faiss"):
     return vectorstore
 
 def get_conversation_chain(vectorstore):
-    llm = ChatOpenAI(max_tokens=300)
+    #llm = ChatOpenAI(max_tokens=300)
     # llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-v0.1", model_kwargs={"temperature":0.9, "max_length":2048},task="text-generation")
     #llm = HuggingFaceHub(repo_id="microsoft/phi-2", model_kwargs={"temperature":0.1, "max_length":1024},task="text-generation")
     #llm = HuggingFaceHub(repo_id="FinGPT/fingpt-mt_qwen-7b_lora", model_kwargs={"temperature":0.5, "max_length":1024},task="text-generation")
+    llm = HuggingFaceHub(repo_id="openai-community/gpt2-xl", model_kwargs={"temperature":0.5, "max_length":1024},task="text-generation")
     memory = ConversationBufferMemory(memory_key='chat_history',output_key='answer', return_messages=True)
     conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory ,response_if_no_docs_found="I don't have this information",rephrase_question=False,return_source_documents=True)
     return conversation_chain
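The net effect of this hunk is a model swap: ChatOpenAI(max_tokens=300) moves behind a comment and HuggingFaceHub's openai-community/gpt2-xl becomes the live LLM. Worth flagging in review: gpt2-xl is a base model with no instruction tuning, so answer quality will differ sharply from the chat model it replaces. Either way the chain is called the same; a usage sketch with an illustrative question:

    # With return_source_documents=True the chain returns a dict carrying both
    # the answer (memory output_key='answer') and the retrieved chunks.
    result = conversation({'question': "What is the standard deduction?"})  # hypothetical question
    print(result['answer'])
    for doc in result['source_documents']:
        print(doc.page_content[:80])    # the context chunks backing the answer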
@@ -89,37 +75,16 @@ def send_to_backend():
 
 
 
-    url = "http://119.63.132.178:8080/chat"
-    response = ""
-    try:
-        # Make a GET request to the API
-        ques = {'text':question}
-        print("ques",ques)
-        response = requests.post(url,json=ques)
-    except:
-        response = "couldn't get response!"
-
-    if response != "couldn't get response!":
-        llamaAnswer = response.json().get('Answer')
-    else:
-        llamaAnswer = response
-
-    # Call your backend function or API here, and replace the following lines with your actual logic
-    """
-    'question': "Respond to the Input in an appropriate manner while following the formatting instructions. \n Input:"+question+" \n formatting instructions:enclose the answer or response to the input in <p> html tags and add other styling using tags to this <p> element, where appropriate, if you can."
-    'question': "Respond to the Input while following the formatting instructions. \n Input:"+question+" \n formatting instructions:enclose the answer or response to the input in <p> html tags and add other styling using tags to this <p> element, where appropriate, if you can."
-    """
     try:
-        response_content = conversation({'question': question
+        response_content = conversation({'question': question})
     except:
         print("conversation chain limit exceeded")
         text_chunks = ""
         vectorstore = get_vectorstore(text_chunks)
         conversation = get_conversation_chain(vectorstore)
-        response_content = conversation({'question': question
+        response_content = conversation({'question': question})
 
 
-    #response_message = markdown.markdown(response_content.get('answer'))
     response_message = response_content.get('answer')
     response_context = response_content.get('source_documents')
     #P, R, F1 = score([response_message], [str(response_context)],lang="en")
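The surviving except branch only works because get_vectorstore ignores its text_chunks argument when vectorstore.faiss already exists on disk: passing the empty string just reloads the pickled index, and the real effect of rebuilding the chain is a fresh, empty ConversationBufferMemory. The failure being caught is most plausibly the buffered chat history outgrowing the model's context window. The same pattern factored into a helper (a sketch; the helper is mine, not in the source):

    def ask_with_retry(question, conversation, vectorstore):
        # If the chain raises (typically: accumulated chat history no longer
        # fits the context window), rebuild it with empty memory and retry once.
        try:
            return conversation({'question': question}), conversation
        except Exception:
            conversation = get_conversation_chain(vectorstore)
            return conversation({'question': question}), conversation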
@@ -127,39 +92,13 @@ def send_to_backend():
     F1 = cosine_similarity(TfidfVectorizer().fit_transform(documents), TfidfVectorizer().fit_transform(documents))
     F1 = (F1[0][1]+0.3) / (np.linalg.norm(F1[0]))
 
-    #print("context: "+str(response_context))
-    print("answer: " + response_message)
-    print("llamaanswer: "+llamaAnswer)
-
-    #If the 'Context' does not contain relevant information then respond to the 'Input' in any appropriate manner."
-    #Formatting and combine responses
-
-    # prompt = [{"role": "user", "content": f"""
-    # Respond to the given 'Input' using the provided contexts 'Context1' and 'Context2' respectivley. Prefer to include information from 'Context1' in your response. For inputs such as "Hey," "Hi," or "Can you help me," or any other greetings in any languages respond from 'Context1'. Do not use phrase like "from the context provided" and do not refer to 'Context1', 'Context2' in any way in your response. Your response should be formatted using HTML tags for improved readability.</p>
-    # Input: [{question}] \n
-    # Context1: [{response_message}] \n
-    # Context2:[{llamaAnswer}]
-    # """}]
-
-
-
-    prompt = f"""
-    Use HTML formatting to make the text labelled 'Text' to improve readibility. Do not refer to 'Text' in you response.\n
-    Text: {response_message}
-    """
-
-    response = openai.Completion.create(
-        engine="gpt-3.5-turbo-instruct", # GPT-3 Base model
-        prompt=prompt,
-        max_tokens = 500 # You can adjust this based on your desired question length
-    )
 
-    finalAnswer = markdown.markdown(
-    print("final Answer:", finalAnswer)
+    finalAnswer = markdown.markdown(response_message)
+    #print("final Answer:", finalAnswer)
 
 
 
-    return jsonify({"response": finalAnswer+"""<br><p style="color: yellow; text-align: right;font-style: italic; font-size: 14px;margin-bottom: 0;">
+    return jsonify({"response": finalAnswer+"""<br><p style="color: yellow; text-align: right;font-style: italic; font-size: 14px;margin-bottom: 0;">F1 SCORE: """+str(F1)+"""</p>"""})
 
 if __name__ == '__main__':
     load_dotenv()
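The retained F1 lines score the answer against its retrieved context by TF-IDF cosine similarity; documents is assembled on a line outside both hunks (old 126 / new 91), presumably as the pair [response_message, str(response_context)]. Two review notes: fitting TfidfVectorizer twice is redundant (both calls fit the same texts), and the (x + 0.3) / norm rescaling is ad hoc rather than a standard metric. The core computation, stripped down with placeholder texts:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Placeholder pair standing in for [response_message, str(response_context)].
    documents = ["the generated answer text", "the retrieved source chunks"]

    tfidf = TfidfVectorizer().fit_transform(documents)  # 2 x |vocab| sparse matrix
    sim = cosine_similarity(tfidf)                      # 2x2 matrix, diagonal = 1.0
    print(sim[0][1])                                    # answer-vs-context similarity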
@@ -167,10 +106,11 @@ if __name__ == '__main__':
     # If not installed, download and install the model
     spacy.cli.download("en_core_web_sm")
 
-
+    #dataset to FAISS Vector Index
     pdf_docs = ["totaltax.pdf"]
     raw_text = get_pdf_text(pdf_docs)
-
+
+    #split
     raw_text1 = raw_text[0:999999]
     raw_text2=raw_text[999000:]
     text_chunks1 = get_text_chunks(raw_text1)
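The two slices overlap on purpose: raw_text[0:999999] ends at index 999998 and raw_text[999000:] starts at index 999000, so 999 characters appear in both halves and a sentence straddling the cut is not silently lost. A generic version of the same idea (function name and defaults are mine):

    def split_with_overlap(text, size=999_999, overlap=999):
        # Successive windows share `overlap` characters so content at each
        # seam lands in at least one complete window.
        parts, start = [], 0
        while start < len(text):
            parts.append(text[start:start + size])
            start += size - overlap
        return parts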
@@ -181,79 +121,11 @@ if __name__ == '__main__':
         textelem = str(chunk)
         textelem = textelem[1:len(textelem)-2]
         text_chunks.append(textelem)
-    print("I'm here 1")
 
-
-    # text_chunks = ""
+    #create vector store and conversational retrieval chain
     vectorstore = get_vectorstore(text_chunks)
     conversation = get_conversation_chain(vectorstore)
-    print("I'm here 2")
-    """
-    questions = []
-    answers = []
-    generated_answers = []
-    contexts=[]
-    questioncontext = {}
-
-
-    excel_file_path = 'MMRRetriever\MistralT0.9MMR.xlsx'
-    text_file_path = 'output.txt'
-
-    df = pd.read_excel(excel_file_path)
-    questions = df['question'].tolist()
-    answers = df['ground_truths'].tolist()
-    """
 
 
-    """
-    with open(text_file_path, 'r') as file:
-        file_content = file.read()
-
-    generated_answers = ast.literal_eval(file_content)
-
-    n = len(generated_answers)
-
-    questions = questions[n:]
-    """
-    """
-    for question in questions:
-        try:
-            response = conversation({'question': str(question)})
-        except:
-            conversation = get_conversation_chain(vectorstore)
-            response = conversation({'question': question})
-
-        print(len(response['answer']))
-        generated_answers.append(str(response['answer']))
-        print(len(str(response['source_documents'])))
-        contexts.append(str(response['source_documents']))
-
-        with open(text_file_path, 'w') as file:
-            file.write(str(generated_answers))
-
-        print(len(generated_answers))
-
-    while len(contexts) != len(generated_answers):
-        contexts.append("")
-
-    df["context"] = contexts
-    df["generated_answer"] = generated_answers
-
-
-
-
-    P, R, F1 = score(generated_answers, answers,lang="en")
-    F1array = list(F1)
-
-    df["Bert Score"] = F1array
-
-    combinedScore = F1.mean()
-    print(combinedScore)
-    #F1array.append(combinedScore)
-    df["Bert Score"] = F1array
-
-
-    df.to_excel(excel_file_path, index=False)
-    """
-    print("I'm done")
+
     app.run(port=3000)
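For completeness, a hypothetical client call against the running app. The @app.route decorator for send_to_backend sits outside every hunk, so the route path and the request key below are assumptions; only the port (3000) and the "response" key of the returned JSON come from the code shown:

    import requests

    resp = requests.post("http://localhost:3000/chat",            # assumed route
                         json={"question": "What is the total tax payable?"})
    print(resp.json()["response"])   # HTML answer plus the yellow F1-score footer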