from flask import Flask, request, jsonify
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import spacy
from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import markdown
import os
import pickle
from flask_cors import CORS
import requests
import numpy as np
app = Flask(__name__)
CORS(app)

# Read OPENAI_API_KEY (and any other secrets) from a local .env file; the
# import alone does not load it, so call load_dotenv() explicitly.
load_dotenv()

# Global conversational retrieval chain, initialised at startup in __main__.
conversation = None
def get_pdf_text(pdf_docs):
    """Concatenate the text of every page of every PDF in pdf_docs."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for image-only pages.
            text += page.extract_text() or ""
    return text
def get_text_chunks(raw_text):
    """Split raw text into semantically coherent chunks via semantic_split."""
    model = SentenceTransformersSimilarity()
    sentence_splitter = SpacySentenceSplitter()
    splitter = SimilarSentenceSplitter(model, sentence_splitter)
    return splitter.split(raw_text)
def get_vectorstore(text_chunks, vectorstore_filename="vectorstore.faiss"):
    """Load a cached FAISS vector store from disk, or build one from text_chunks."""
    if os.path.exists(vectorstore_filename):
        with open(vectorstore_filename, 'rb') as file:
            vectorstore = pickle.load(file)
        print("vectorstore loaded")
    else:
        embeddings = OpenAIEmbeddings()
        vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
        with open(vectorstore_filename, 'wb') as file:
            pickle.dump(vectorstore, file)
    return vectorstore
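
# Note: this app persists the whole vector store with pickle. LangChain's FAISS
# wrapper also offers native persistence, sketched below (the folder name is an
# assumption, not part of this app):
#
#   vectorstore.save_local("faiss_index")
#   vectorstore = FAISS.load_local("faiss_index", OpenAIEmbeddings())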
def get_conversation_chain(vectorstore):
    """Build a ConversationalRetrievalChain with buffer memory over the vector store."""
    llm = ChatOpenAI(max_tokens=400)
    memory = ConversationBufferMemory(memory_key='chat_history', output_key='answer', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={'k': 3}),
        memory=memory,
        response_if_no_docs_found="I don't have this information",
        rephrase_question=False,
        return_source_documents=True,
    )
    return conversation_chain
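
# Example request to the endpoint below (assuming the default app.run(port=5000)
# at the bottom of this file; the question text is illustrative):
#
#   curl -X POST http://localhost:5000/send_to_backend \
#        -H "Content-Type: application/json" \
#        -d '{"userMsg": "How is total tax liability calculated?"}'
#
# The endpoint replies with JSON of the form
# {"response": "<answer HTML> ... F1 SCORE: ..."}.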
@app.route('/send_to_backend', methods=['POST'])
def send_to_backend():
    global conversation
    question = request.get_json().get("userMsg")
    print("Question: ", question)
    """
    # Request LlamaRAFT for an answer (disabled)
    url = "https://pretty-hounds-sip.loca.lt/chat"
    data = {
        "text": question
    }
    headers = {
        'bypass-tunnel-reminder': 'true'
    }
    response = requests.post(url, json=data, headers=headers)
    if response.status_code == 201:
        llamaanswer = response.json()['Answer']
    else:
        llamaanswer = ""
    """
    # GPT RAG
    try:
        response_content = conversation({'question': question})
    except Exception:
        # The buffer memory can grow past the model's context window; rebuild
        # the chain. The vector store is reloaded from the pickle on disk, so
        # no text chunks are needed here.
        print("conversation chain limit exceeded")
        vectorstore = get_vectorstore([])
        conversation = get_conversation_chain(vectorstore)
        response_content = conversation({'question': question})
    response_message = response_content.get('answer')
    response_context = response_content.get('source_documents')
    """
    if llamaanswer != "":
        response_message = llamaanswer
    """
    # Ad-hoc "F1" confidence score: TF-IDF cosine similarity between the
    # answer and its retrieved source documents, shifted and normalised.
    documents = [response_message, str(response_context)]
    tfidf = TfidfVectorizer().fit_transform(documents)
    F1 = cosine_similarity(tfidf, tfidf)
    F1 = (F1[0][1] + 0.3) / np.linalg.norm(F1[0])
    finalAnswer = markdown.markdown(response_message)
    return jsonify({
        "response": finalAnswer
        + """<br><p style="color: red; text-align: right; font-style: italic; font-size: 14px; margin-bottom: 0;">F1 SCORE: """
        + str(F1)
        + "</p>"
    })
if __name__ == '__main__':
    # Download the spaCy sentence-segmentation model on first run if it is
    # not already installed.
    if not spacy.util.is_package("en_core_web_sm"):
        spacy.cli.download("en_core_web_sm")

    # Build the FAISS vector index from the source dataset.
    pdf_docs = ["totaltax.pdf"]
    raw_text = get_pdf_text(pdf_docs)

    # Split the text into two overlapping slices so each stays under spaCy's
    # default max_length of 1,000,000 characters; the ~1,000-character overlap
    # avoids dropping sentences at the boundary.
    raw_text1 = raw_text[0:999999]
    raw_text2 = raw_text[999000:]
    text_chunks1 = get_text_chunks(raw_text1)
    text_chunks2 = get_text_chunks(raw_text2)
    text_chunkslist = text_chunks1 + text_chunks2

    # Each chunk comes back as a list of sentences; flatten it to plain text
    # by stripping the surrounding brackets from its string form.
    text_chunks = []
    for chunk in text_chunkslist:
        text_chunks.append(str(chunk)[1:-1])

    # Create the vector store and the conversational retrieval chain.
    vectorstore = get_vectorstore(text_chunks)
    conversation = get_conversation_chain(vectorstore)
    app.run(port=5000)