# GenAISubmission / api.py
from flask import Flask, request, jsonify
from dotenv import load_dotenv
import pandas as pd
from PyPDF2 import PdfReader
import openai
import spacy
from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFaceHub
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import markdown
import os
import pickle
from flask_cors import CORS
import requests
import numpy as np
# Flask application with CORS enabled so a browser front-end on another
# origin can call the /send_to_backend endpoint.
app = Flask(__name__)
CORS(app)
# Global ConversationalRetrievalChain.  Starts as "" (not callable); it is
# replaced with a real chain in __main__ or rebuilt inside send_to_backend.
conversation = ""
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every PDF.

    Args:
        pdf_docs: iterable of file paths or file-like objects accepted by
            PyPDF2's ``PdfReader``.

    Returns:
        A single string containing all page text, in document order.
    """
    pages = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None (e.g. image-only pages);
            # the original `text + None` would raise TypeError there.
            pages.append(page.extract_text() or "")
    # join once instead of quadratic `text = text + ...` in the loop
    return "".join(pages)
def get_text_chunks(raw_text):
    """Split *raw_text* into semantically coherent chunks.

    Sentences are segmented with spaCy, then grouped by embedding
    similarity via semantic_split's SimilarSentenceSplitter.
    """
    similarity_model = SentenceTransformersSimilarity()
    splitter = SimilarSentenceSplitter(similarity_model, SpacySentenceSplitter())
    return splitter.split(raw_text)
def get_vectorstore(text_chunks, vectorstore_filename="vectorstore.faiss"):
    """Return a FAISS vector store, cached on disk as a pickle.

    If *vectorstore_filename* does not exist, the store is built from
    *text_chunks* using OpenAI embeddings and pickled for next time;
    otherwise the pickled store is loaded and *text_chunks* is ignored.
    """
    if not os.path.exists(vectorstore_filename):
        store = FAISS.from_texts(texts=text_chunks, embedding=OpenAIEmbeddings())
        with open(vectorstore_filename, 'wb') as file:
            pickle.dump(store, file)
        return store
    # NOTE(review): pickle.load is unsafe on untrusted files — confirm this
    # cache file can only be produced by this application.
    with open(vectorstore_filename, 'rb') as file:
        store = pickle.load(file)
    print("vectorstore loaded")
    return store
def get_conversation_chain(vectorstore):
    """Build a ConversationalRetrievalChain over *vectorstore*.

    Uses a ChatOpenAI model (max 400 tokens per answer), a 3-nearest
    similarity retriever, and buffer memory keyed on 'chat_history'.
    Source documents are returned alongside each answer.
    """
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={'k': 3},
    )
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        output_key='answer',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(max_tokens=400),
        retriever=retriever,
        memory=chat_memory,
        response_if_no_docs_found="I don't have this information",
        rephrase_question=False,
        return_source_documents=True,
    )
@app.route('/send_to_backend', methods=['POST'])
def send_to_backend():
    """Answer a user question via the global RAG conversation chain.

    Expects a JSON body ``{"userMsg": <question>}``.  Returns JSON with an
    HTML-formatted answer followed by a TF-IDF cosine-similarity "F1"
    score between the answer and its retrieved source documents.
    """
    global conversation
    question = request.get_json().get("userMsg")
    print("Question: ", question)
    # GPT RAG: the chain may be missing (module-level `conversation` starts
    # as "" and is not callable) or may fail once its context is exceeded;
    # in either case rebuild it from the cached vector store and retry.
    try:
        response_content = conversation({'question': question})
    except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        print("conversation chain limit exceeded")
        # get_vectorstore ignores its first argument when the pickle cache
        # exists on disk, so "" is only a placeholder here.
        vectorstore = get_vectorstore("")
        conversation = get_conversation_chain(vectorstore)
        response_content = conversation({'question': question})
    response_message = response_content.get('answer')
    response_context = response_content.get('source_documents')
    # Cosine similarity between the answer and its sources.  Fit the
    # vectorizer once (the original fitted an identical TfidfVectorizer
    # twice on the same input).
    documents = [response_message, str(response_context)]
    tfidf = TfidfVectorizer().fit_transform(documents)
    F1 = cosine_similarity(tfidf, tfidf)
    # Ad-hoc rescaling of the off-diagonal similarity; kept as-is.
    F1 = (F1[0][1] + 0.3) / (np.linalg.norm(F1[0]))
    finalAnswer = markdown.markdown(response_message)
    return jsonify({"response": finalAnswer + """<br><p style="color: red; text-align: right;font-style: italic; font-size: 14px;margin-bottom: 0;">F1 SCORE: """ + str(F1) + """</p>"""})
if __name__ == '__main__':
    # Ensure the spaCy model used by the sentence splitter is installed.
    if not spacy.util.is_package("en_core_web_sm"):
        # If not installed, download and install the model
        spacy.cli.download("en_core_web_sm")

    # Dataset -> FAISS vector index: extract the reference PDF's text.
    pdf_docs = ["totaltax.pdf"]
    raw_text = get_pdf_text(pdf_docs)

    # Split the text into two slices that overlap on characters
    # 999000-999998 — presumably so no sentence is lost at the cut point
    # (TODO confirm the splitter's length limit motivating the split).
    raw_text1 = raw_text[0:999999]
    raw_text2 = raw_text[999000:]
    text_chunkslist = get_text_chunks(raw_text1) + get_text_chunks(raw_text2)

    # Each chunk is a list of sentences; str(chunk) renders "['…', '…']".
    # Strip exactly the surrounding brackets with [1:-1] — the original
    # [1:len-2] was off by one and also dropped the last content character.
    text_chunks = [str(chunk)[1:-1] for chunk in text_chunkslist]

    # Create vector store and conversational retrieval chain, then serve.
    vectorstore = get_vectorstore(text_chunks)
    conversation = get_conversation_chain(vectorstore)
    app.run(port=5000)