HeavenWaters committed
Commit 65a4e68
1 Parent(s): fad4db7

Update api.py

Files changed (1):
  1. api.py (+12, -140)
api.py CHANGED

@@ -1,6 +1,3 @@
-# Install the faiss package first:
-# pip install faiss
-
 from flask import Flask, request, jsonify
 from dotenv import load_dotenv
 import pandas as pd
@@ -8,25 +5,19 @@ from PyPDF2 import PdfReader
 import openai
 import spacy
 from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter
-#from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
-#from langchain_core.prompts import PromptTemplate
 from langchain.chat_models import ChatOpenAI
 from langchain.llms import HuggingFaceHub
-# from bert_score import score
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-#from transformers import AutoTokenizer, AutoModel
 from scipy.spatial.distance import cosine
 import markdown
 import os
 import pickle
 from flask_cors import CORS
-#import pandas as pd
-#import ast
 import requests
 import numpy as np
 
@@ -46,11 +37,6 @@ def get_pdf_text(pdf_docs):
     return text
 
 def get_text_chunks(raw_text):
-    """
-    text_splitter = CharacterTextSplitter(separator='\n', chunk_size=3000, chunk_overlap=400, length_function=len)
-    chunks = text_splitter.split_text(raw_text)
-    return chunks
-    """
     model = SentenceTransformersSimilarity()
     sentence_splitter = SpacySentenceSplitter()
     splitter = SimilarSentenceSplitter(model, sentence_splitter)
@@ -58,11 +44,10 @@ def get_text_chunks(raw_text):
     return chunks
 
 def get_vectorstore(text_chunks, vectorstore_filename="vectorstore.faiss"):
-    print("I'm in")
     if os.path.exists(vectorstore_filename):
         with open(vectorstore_filename, 'rb') as file:
             vectorstore = pickle.load(file)
-        print("Hello vectorstore loaded")
+        print("vectorstore loaded")
     else:
         embeddings = OpenAIEmbeddings()
         vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
@@ -72,10 +57,11 @@ def get_vectorstore(text_chunks, vectorstore_filename="vectorstore.faiss"):
     return vectorstore
 
 def get_conversation_chain(vectorstore):
-    llm = ChatOpenAI(max_tokens=300)
+    #llm = ChatOpenAI(max_tokens=300)
     # llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-v0.1", model_kwargs={"temperature": 0.9, "max_length": 2048}, task="text-generation")
     #llm = HuggingFaceHub(repo_id="microsoft/phi-2", model_kwargs={"temperature": 0.1, "max_length": 1024}, task="text-generation")
     #llm = HuggingFaceHub(repo_id="FinGPT/fingpt-mt_qwen-7b_lora", model_kwargs={"temperature": 0.5, "max_length": 1024}, task="text-generation")
+    llm = HuggingFaceHub(repo_id="openai-community/gpt2-xl", model_kwargs={"temperature": 0.5, "max_length": 1024}, task="text-generation")
     memory = ConversationBufferMemory(memory_key='chat_history', output_key='answer', return_messages=True)
     conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory, response_if_no_docs_found="I don't have this information", rephrase_question=False, return_source_documents=True)
     return conversation_chain
@@ -89,37 +75,16 @@ def send_to_backend():
 
 
 
-    url = "http://119.63.132.178:8080/chat"
-    response = ""
-    try:
-        # Make a POST request to the external chat API
-        ques = {'text': question}
-        print("ques", ques)
-        response = requests.post(url, json=ques)
-    except:
-        response = "couldn't get response!"
-
-    if response != "couldn't get response!":
-        llamaAnswer = response.json().get('Answer')
-    else:
-        llamaAnswer = response
-
-    # Call your backend function or API here, and replace the following lines with your actual logic
-    """
-    'question': "Respond to the Input in an appropriate manner while following the formatting instructions. \n Input:"+question+" \n formatting instructions: enclose the answer or response to the input in <p> html tags and add other styling using tags to this <p> element, where appropriate, if you can."
-    'question': "Respond to the Input while following the formatting instructions. \n Input:"+question+" \n formatting instructions: enclose the answer or response to the input in <p> html tags and add other styling using tags to this <p> element, where appropriate, if you can."
-    """
     try:
-        response_content = conversation({'question': question+f" (You can use this additional context in addition to the context\n Additional-Context: {llamaAnswer})"})
+        response_content = conversation({'question': question})
     except:
         print("conversation chain limit exceeded")
         text_chunks = ""
         vectorstore = get_vectorstore(text_chunks)
         conversation = get_conversation_chain(vectorstore)
-        response_content = conversation({'question': question+f" (You can use this additional context in addition to the context \n Additional-Context: {llamaAnswer})"})
 
+        response_content = conversation({'question': question})
 
-    #response_message = markdown.markdown(response_content.get('answer'))
     response_message = response_content.get('answer')
     response_context = response_content.get('source_documents')
     #P, R, F1 = score([response_message], [str(response_context)], lang="en")
@@ -127,39 +92,13 @@ def send_to_backend():
     F1 = cosine_similarity(TfidfVectorizer().fit_transform(documents), TfidfVectorizer().fit_transform(documents))
     F1 = (F1[0][1]+0.3) / (np.linalg.norm(F1[0]))
 
-    #print("context: "+str(response_context))
-    print("answer: " + response_message)
-    print("llamaanswer: "+llamaAnswer)
-
-    #If the 'Context' does not contain relevant information then respond to the 'Input' in any appropriate manner.
-    #Format and combine responses
-
-    # prompt = [{"role": "user", "content": f"""
-    # Respond to the given 'Input' using the provided contexts 'Context1' and 'Context2' respectively. Prefer to include information from 'Context1' in your response. For inputs such as "Hey," "Hi," or "Can you help me," or any other greetings in any language, respond from 'Context1'. Do not use phrases like "from the context provided" and do not refer to 'Context1' or 'Context2' in any way in your response. Your response should be formatted using HTML tags for improved readability.</p>
-    # Input: [{question}] \n
-    # Context1: [{response_message}] \n
-    # Context2: [{llamaAnswer}]
-    # """}]
-
-
-
-    prompt = f"""
-    Use HTML formatting on the text labelled 'Text' to improve readability. Do not refer to 'Text' in your response.\n
-    Text: {response_message}
-    """
-
-    response = openai.Completion.create(
-        engine="gpt-3.5-turbo-instruct",  # GPT-3 base model
-        prompt=prompt,
-        max_tokens=500  # adjust based on the desired response length
-    )
 
-    finalAnswer = markdown.markdown(response.choices[0].text.strip())
-    print("final Answer:", finalAnswer)
+    finalAnswer = markdown.markdown(response_message)
+    #print("final Answer:", finalAnswer)
 
 
 
-    return jsonify({"response": finalAnswer+"""<br><p style="color: yellow; text-align: right; font-style: italic; font-size: 14px; margin-bottom: 0;">RAGAS SCORE: """+str(F1)+"""</p>"""})
+    return jsonify({"response": finalAnswer+"""<br><p style="color: yellow; text-align: right; font-style: italic; font-size: 14px; margin-bottom: 0;">F1 SCORE: """+str(F1)+"""</p>"""})
 
 if __name__ == '__main__':
     load_dotenv()
@@ -167,10 +106,11 @@ if __name__ == '__main__':
     # If not installed, download and install the model
     spacy.cli.download("en_core_web_sm")
 
-    print("I'm here")
+    # dataset to FAISS vector index
     pdf_docs = ["totaltax.pdf"]
    raw_text = get_pdf_text(pdf_docs)
-    # #text_chunkslist = get_text_chunks(raw_text)
+
+    # split the raw text into two overlapping halves
     raw_text1 = raw_text[0:999999]
     raw_text2 = raw_text[999000:]
     text_chunks1 = get_text_chunks(raw_text1)
@@ -181,79 +121,11 @@ if __name__ == '__main__':
         textelem = str(chunk)
         textelem = textelem[1:len(textelem)-2]
         text_chunks.append(textelem)
-    print("I'm here 1")
 
-
-    # text_chunks = ""
+    # create vector store and conversational retrieval chain
     vectorstore = get_vectorstore(text_chunks)
     conversation = get_conversation_chain(vectorstore)
-    print("I'm here 2")
-    """
-    questions = []
-    answers = []
-    generated_answers = []
-    contexts = []
-    questioncontext = {}
-
-
-    excel_file_path = 'MMRRetriever\MistralT0.9MMR.xlsx'
-    text_file_path = 'output.txt'
 
-    df = pd.read_excel(excel_file_path)
-    questions = df['question'].tolist()
-    answers = df['ground_truths'].tolist()
-    """
 
 
-    """
-    with open(text_file_path, 'r') as file:
-        file_content = file.read()
-
-    generated_answers = ast.literal_eval(file_content)
-
-    n = len(generated_answers)
-
-    questions = questions[n:]
-    """
-    """
-    for question in questions:
-        try:
-            response = conversation({'question': str(question)})
-        except:
-            conversation = get_conversation_chain(vectorstore)
-            response = conversation({'question': question})
-
-        print(len(response['answer']))
-        generated_answers.append(str(response['answer']))
-        print(len(str(response['source_documents'])))
-        contexts.append(str(response['source_documents']))
-
-    with open(text_file_path, 'w') as file:
-        file.write(str(generated_answers))
-
-    print(len(generated_answers))
-
-    while len(contexts) != len(generated_answers):
-        contexts.append("")
-
-    df["context"] = contexts
-    df["generated_answer"] = generated_answers
-
-
-
-
-    P, R, F1 = score(generated_answers, answers, lang="en")
-    F1array = list(F1)
-
-    df["Bert Score"] = F1array
-
-    combinedScore = F1.mean()
-    print(combinedScore)
-    #F1array.append(combinedScore)
-    df["Bert Score"] = F1array
-
-
-    df.to_excel(excel_file_path, index=False)
-    """
-    print("I'm done")
     app.run(port=3000)
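A few notes on the resulting file.

get_text_chunks builds its splitter from the semantic_split package, but the line that actually produces chunks falls outside these hunks. Below is a minimal sketch of semantic_split's documented usage; the input text is illustrative and the split call is assumed to match the package README:

    # Semantic chunking sketch; groups consecutive, semantically similar sentences.
    from semantic_split import (
        SimilarSentenceSplitter,
        SentenceTransformersSimilarity,
        SpacySentenceSplitter,
    )

    model = SentenceTransformersSimilarity()     # sentence-embedding similarity
    sentence_splitter = SpacySentenceSplitter()  # spaCy sentence boundaries
    splitter = SimilarSentenceSplitter(model, sentence_splitter)

    text = "Income tax is levied annually. Rates vary by bracket. FAISS is a similarity-search library."
    chunks = splitter.split(text)                # list of sentence groups
    print(chunks)

Each chunk comes back as a list of sentences, which appears to be why the __main__ block converts each chunk to a string and strips the surrounding brackets with textelem[1:len(textelem)-2] before handing the text to the vector store.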
+ print("vectorstore loaded")
51
  else:
52
  embeddings = OpenAIEmbeddings()
53
  vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
 
57
  return vectorstore
58
 
59
  def get_conversation_chain(vectorstore):
60
+ #llm = ChatOpenAI(max_tokens=300)
61
  # llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-v0.1", model_kwargs={"temperature":0.9, "max_length":2048},task="text-generation")
62
  #llm = HuggingFaceHub(repo_id="microsoft/phi-2", model_kwargs={"temperature":0.1, "max_length":1024},task="text-generation")
63
  #llm = HuggingFaceHub(repo_id="FinGPT/fingpt-mt_qwen-7b_lora", model_kwargs={"temperature":0.5, "max_length":1024},task="text-generation")
64
+ llm = HuggingFaceHub(repo_id="openai-community/gpt2-xl", model_kwargs={"temperature":0.5, "max_length":1024},task="text-generation")
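The headline change swaps the chain's LLM from ChatOpenAI to the Hugging Face Inference API. A standalone sketch of the new configuration follows; the prompt is illustrative and the token value is a placeholder (the HuggingFaceHub wrapper reads HUGGINGFACEHUB_API_TOKEN from the environment):

    # The LLM introduced by this commit, in isolation. Note that gpt2-xl is a
    # base model, not instruction-tuned, so the chain's QA prompts will be
    # answered as plain text continuations rather than chat-style replies.
    import os
    from langchain.llms import HuggingFaceHub

    os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "hf_xxx")  # placeholder token

    llm = HuggingFaceHub(
        repo_id="openai-community/gpt2-xl",
        model_kwargs={"temperature": 0.5, "max_length": 1024},
        task="text-generation",
    )
    print(llm("Question: What is a vector store?\nAnswer:"))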
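The JSON response still appends the TF-IDF cosine score, now labelled F1 SCORE rather than RAGAS SCORE. The construction of the documents list is not visible in these hunks; the sketch below assumes it pairs the generated answer with its retrieved context, and the sample strings are hypothetical:

    # How the reported score is computed. Note it is an ad-hoc rescaled cosine
    # similarity, not a standard F1 metric.
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    response_message = "Dividend income is taxed at 15% in this bracket."    # hypothetical answer
    response_context = "Retrieved source passages about dividend taxation."  # hypothetical context
    documents = [response_message, str(response_context)]                    # assumed pairing

    tfidf = TfidfVectorizer().fit_transform(documents)
    sim = cosine_similarity(tfidf, tfidf)  # 2x2 similarity matrix
    # sim[0][1] is the answer-vs-context similarity; the +0.3 offset and the
    # row-norm division mirror the rescaling in send_to_backend above.
    F1 = (sim[0][1] + 0.3) / np.linalg.norm(sim[0])
    print(F1)

Incidentally, the original code fits a fresh TfidfVectorizer for each argument of cosine_similarity; fitting once and reusing the matrix, as above, gives the same result.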
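For reference, this is the call shape send_to_backend now uses once the chain is built; the question string is illustrative:

    # Exercising the ConversationalRetrievalChain exactly as send_to_backend does;
    # conversation is the chain created in the __main__ block.
    result = conversation({'question': "What is the total tax payable on dividends?"})

    print(result['answer'])                 # the LLM's answer; chat memory is updated too
    print(len(result['source_documents']))  # chunks retrieved from the FAISS index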
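Finally, get_vectorstore still caches by pickling the whole FAISS wrapper to vectorstore.faiss. A sketch of the same caching using FAISS's own persistence helpers instead; the directory name is illustrative, and recent langchain releases additionally require allow_dangerous_deserialization=True on load_local:

    # Alternative to pickle-based caching, using the save_local/load_local
    # methods on langchain's FAISS vector store.
    import os
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import FAISS

    def get_vectorstore_cached(text_chunks, index_dir="vectorstore_index"):
        embeddings = OpenAIEmbeddings()
        if os.path.isdir(index_dir):
            return FAISS.load_local(index_dir, embeddings)  # reuse the cached index
        vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
        vectorstore.save_local(index_dir)                   # persist for the next run
        return vectorstore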