from flask import Flask, request, jsonify
from dotenv import load_dotenv
import pandas as pd
from PyPDF2 import PdfReader
import openai
import spacy
from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFaceHub
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import markdown
import os
import pickle
from flask_cors import CORS
import requests
import numpy as np
load_dotenv()  # load OPENAI_API_KEY (and any other secrets) from a local .env

app = Flask(__name__)
CORS(app)

# Module-level handle to the conversational retrieval chain; set in __main__
# and rebuilt inside the request handler if a call fails.
conversation = ""
def get_pdf_text(pdf_docs):
    # Concatenate the extracted text of every page of every PDF into one string.
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Guard against pages with no extractable text.
            text = text + (page.extract_text() or "")
    return text
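
# Usage sketch: get_pdf_text(["totaltax.pdf"]) returns the concatenated text
# of every page. PdfReader accepts both file paths and file-like objects, so
# uploaded streams work as well as local files.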
def get_text_chunks(raw_text):
    # Group sentences into semantically coherent chunks via semantic_split:
    # a SentenceTransformers model scores sentence similarity and spaCy
    # handles sentence boundary detection.
    model = SentenceTransformersSimilarity()
    sentence_splitter = SpacySentenceSplitter()
    splitter = SimilarSentenceSplitter(model, sentence_splitter)
    chunks = splitter.split(raw_text)
    return chunks
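
# For reference, a minimal sketch of what the splitter is expected to produce
# (based on the semantic_split README; treat the exact shape as an assumption
# of that library's API):
#
#   chunks = get_text_chunks("Dogs bark. Cats meow. Paris is in France.")
#   # -> a list of sentence groups, e.g.
#   #    [["Dogs bark.", "Cats meow."], ["Paris is in France."]]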
def get_vectorstore(text_chunks, vectorstore_filename="vectorstore.faiss"):
    # Reuse a previously pickled FAISS index if one exists; otherwise embed
    # the chunks with OpenAI embeddings and persist the resulting store.
    if os.path.exists(vectorstore_filename):
        with open(vectorstore_filename, 'rb') as file:
            vectorstore = pickle.load(file)
        print("vectorstore loaded")
    else:
        embeddings = OpenAIEmbeddings()
        vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
        with open(vectorstore_filename, 'wb') as file:
            pickle.dump(vectorstore, file)
    return vectorstore
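
# Note: pickling works with the LangChain version this app targets, but FAISS
# stores also ship their own persistence helpers, which are the safer choice
# if the pickle ever fails to round-trip. A sketch, assuming the same
# embeddings object is available at load time:
#
#   vectorstore.save_local("vectorstore_index")
#   vectorstore = FAISS.load_local("vectorstore_index", OpenAIEmbeddings())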
def get_conversation_chain(vectorstore):
    llm = ChatOpenAI(max_tokens=400)
    memory = ConversationBufferMemory(memory_key='chat_history', output_key='answer', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={'k': 3}),
        memory=memory,
        response_if_no_docs_found="I don't have this information",
        rephrase_question=False,
        return_source_documents=True,
    )
    return conversation_chain
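
# A minimal usage sketch (the question text is illustrative): with
# return_source_documents=True, calling the chain returns a dict holding
# both the answer and the retrieved chunks, which is how the handler
# below consumes it.
#
#   chain = get_conversation_chain(vectorstore)
#   result = chain({'question': 'What is the standard deduction?'})
#   result['answer'], result['source_documents']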
@app.route("/chat", methods=["POST"])  # route path assumed; match it to the frontend
def send_to_backend():
    global conversation
    question = request.get_json().get("userMsg")
    print("Question: ", question)
""" | |
#Request LlamaRAFT for answer# | |
url = "https://pretty-hounds-sip.loca.lt/chat" | |
data = { | |
"text": question | |
} | |
headers = { | |
'bypass-tunnel-reminder': 'true' | |
} | |
response = requests.post(url, json=data, headers=headers) | |
if response.status_code == 201: | |
llamaanswer = response.json()['Answer'] | |
else: | |
llamaanswer = "" | |
""" | |
    # GPT RAG: ask the conversational retrieval chain.
    try:
        response_content = conversation({'question': question})
    except Exception:
        # The chain can fail, e.g. once the accumulated chat history exceeds
        # the model's context window; rebuild it from the persisted
        # vectorstore (which resets the memory) and retry once.
        print("conversation chain limit exceeded")
        text_chunks = ""  # unused here: the pickled vectorstore is loaded from disk
        vectorstore = get_vectorstore(text_chunks)
        conversation = get_conversation_chain(vectorstore)
        response_content = conversation({'question': question})
    response_message = response_content.get('answer')
    response_context = response_content.get('source_documents')
""" | |
if llamaanswer != "": | |
response_message = llamaanswer | |
""" | |
    # Heuristic confidence value reported to the UI as "F1 SCORE": the TF-IDF
    # cosine similarity between the answer and its source documents, shifted
    # and normalised (not a classification F1). Fit the vectorizer once and
    # reuse the matrix rather than fitting it twice on the same documents.
    documents = [response_message, str(response_context)]
    tfidf = TfidfVectorizer().fit_transform(documents)
    F1 = cosine_similarity(tfidf, tfidf)
    F1 = (F1[0][1] + 0.3) / np.linalg.norm(F1[0])
    finalAnswer = markdown.markdown(response_message)
    #print("final Answer:", finalAnswer)
    return jsonify({"response": finalAnswer + """<br><p style="color: red; text-align: right; font-style: italic; font-size: 14px; margin-bottom: 0;">F1 SCORE: """ + str(F1) + """</p>"""})
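
# Example request against the assumed /chat route:
#   curl -X POST http://localhost:5000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"userMsg": "What is the filing deadline?"}'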
if __name__ == '__main__':
    # Download the spaCy model on first run if it is not already installed.
    if not spacy.util.is_package("en_core_web_sm"):
        spacy.cli.download("en_core_web_sm")

    # Dataset -> FAISS vector index.
    pdf_docs = ["totaltax.pdf"]
    raw_text = get_pdf_text(pdf_docs)

    # spaCy rejects texts longer than 1,000,000 characters by default, so the
    # raw text is split into two slices below that limit (with a small overlap
    # so nothing is lost at the boundary) and chunked separately.
    raw_text1 = raw_text[0:999999]
    raw_text2 = raw_text[999000:]
    text_chunks1 = get_text_chunks(raw_text1)
    text_chunks2 = get_text_chunks(raw_text2)
    text_chunkslist = text_chunks1 + text_chunks2

    # Each chunk is a list of sentences; join them into one clean string for
    # embedding instead of slicing str(chunk), which leaves quote and comma
    # artifacts in the text.
    text_chunks = []
    for chunk in text_chunkslist:
        text_chunks.append(" ".join(str(sentence) for sentence in chunk))

    # Create vector store and conversational retrieval chain.
    vectorstore = get_vectorstore(text_chunks)
    conversation = get_conversation_chain(vectorstore)
    app.run(port=5000)