Commit 65a4e68 • Update api.py
Parent(s): fad4db7

api.py CHANGED
@@ -1,6 +1,3 @@
-# Install the faiss package first:
-# pip install faiss
-
 from flask import Flask, request, jsonify
 from dotenv import load_dotenv
 import pandas as pd
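(Dropping the stale install hint is reasonable in any case: the maintained FAISS wheels on PyPI are published as faiss-cpu and faiss-gpu, so a plain "pip install faiss" would not have pulled in the official library this code depends on.)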
@@ -8,25 +5,19 @@ from PyPDF2 import PdfReader
 import openai
 import spacy
 from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter
-#from langchain.text_splitter import CharacterTextSplitter
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
-#from langchain_core.prompts import PromptTemplate
 from langchain.chat_models import ChatOpenAI
 from langchain.llms import HuggingFaceHub
-# from bert_score import score
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-#from transformers import AutoTokenizer, AutoModel
 from scipy.spatial.distance import cosine
 import markdown
 import os
 import pickle
 from flask_cors import CORS
-#import pandas as pd
-#import ast
 import requests
 import numpy as np
 
@@ -46,11 +37,6 @@ def get_pdf_text(pdf_docs):
     return text
 
 def get_text_chunks(raw_text):
-    """
-    text_splitter = CharacterTextSplitter(separator='\n', chunk_size=3000, chunk_overlap=400, length_function=len)
-    chunks = text_splitter.split_text(raw_text)
-    return chunks
-    """
     model = SentenceTransformersSimilarity()
     sentence_splitter = SpacySentenceSplitter()
     splitter = SimilarSentenceSplitter(model, sentence_splitter)
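Note that both get_text_chunks hunks skip the line that actually produces the chunks (old line 57 / new line 43 falls between them); going by semantic_split's documented API, it is presumably a splitter.split(...) call. A minimal sketch of the post-commit function under that assumption:

    from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter

    def get_text_chunks(raw_text):
        # Group adjacent sentences by embedding similarity instead of the
        # fixed 3000-character windows of the deleted CharacterTextSplitter path.
        model = SentenceTransformersSimilarity()      # sentence-transformers similarity scorer
        sentence_splitter = SpacySentenceSplitter()   # spaCy sentence boundaries
        splitter = SimilarSentenceSplitter(model, sentence_splitter)
        chunks = splitter.split(raw_text)             # assumed: the line the diff does not show
        return chunks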
@@ -58,11 +44,10 @@ def get_text_chunks(raw_text):
     return chunks
 
 def get_vectorstore(text_chunks, vectorstore_filename="vectorstore.faiss"):
-    print("I'm in")
     if os.path.exists(vectorstore_filename):
         with open(vectorstore_filename, 'rb') as file:
             vectorstore = pickle.load(file)
-        print("
+        print("vectorstore loaded")
     else:
         embeddings = OpenAIEmbeddings()
         vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
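The caching above round-trips the whole vectorstore through pickle (the write presumably happens on old lines 69-71 / new lines 54-56, outside the hunk). Raw pickles of a FAISS-backed store are brittle across library versions; for comparison, a sketch of the same cache-or-build pattern using the FAISS wrapper's own save_local/load_local persistence (the function and directory names here are illustrative):

    import os
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import FAISS

    def get_vectorstore_local(text_chunks, index_dir="vectorstore_index"):
        embeddings = OpenAIEmbeddings()
        if os.path.exists(index_dir):
            # Reload the previously saved index instead of re-embedding.
            return FAISS.load_local(index_dir, embeddings)
        vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
        vectorstore.save_local(index_dir)   # persist for the next start-up
        return vectorstore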
@@ -72,10 +57,11 @@ def get_vectorstore(text_chunks, vectorstore_filename="vectorstore.faiss"):
     return vectorstore
 
 def get_conversation_chain(vectorstore):
-    llm = ChatOpenAI(max_tokens=300)
+    #llm = ChatOpenAI(max_tokens=300)
     # llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-v0.1", model_kwargs={"temperature":0.9, "max_length":2048},task="text-generation")
     #llm = HuggingFaceHub(repo_id="microsoft/phi-2", model_kwargs={"temperature":0.1, "max_length":1024},task="text-generation")
     #llm = HuggingFaceHub(repo_id="FinGPT/fingpt-mt_qwen-7b_lora", model_kwargs={"temperature":0.5, "max_length":1024},task="text-generation")
+    llm = HuggingFaceHub(repo_id="openai-community/gpt2-xl", model_kwargs={"temperature":0.5, "max_length":1024},task="text-generation")
     memory = ConversationBufferMemory(memory_key='chat_history',output_key='answer', return_messages=True)
     conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory ,response_if_no_docs_found="I don't have this information",rephrase_question=False,return_source_documents=True)
     return conversation_chain
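The net effect of this hunk is a model swap: ChatOpenAI(max_tokens=300) moves behind a comment and HuggingFaceHub's openai-community/gpt2-xl becomes the live LLM. Worth flagging in review: gpt2-xl is a base model with no instruction tuning, so answer quality will differ sharply from the chat model it replaces. Either way the chain is called the same; a usage sketch with an illustrative question:

    # With return_source_documents=True the chain returns a dict carrying both
    # the answer (memory output_key='answer') and the retrieved chunks.
    result = conversation({'question': "What is the standard deduction?"})  # hypothetical question
    print(result['answer'])
    for doc in result['source_documents']:
        print(doc.page_content[:80])    # the context chunks backing the answer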
@@ -89,37 +75,16 @@ def send_to_backend():
 
 
 
-    url = "http://119.63.132.178:8080/chat"
-    response = ""
-    try:
-        # Make a GET request to the API
-        ques = {'text':question}
-        print("ques",ques)
-        response = requests.post(url,json=ques)
-    except:
-        response = "couldn't get response!"
-
-    if response != "couldn't get response!":
-        llamaAnswer = response.json().get('Answer')
-    else:
-        llamaAnswer = response
-
-    # Call your backend function or API here, and replace the following lines with your actual logic
-    """
-    'question': "Respond to the Input in an appropriate manner while following the formatting instructions. \n Input:"+question+" \n formatting instructions:enclose the answer or response to the input in <p> html tags and add other styling using tags to this <p> element, where appropriate, if you can."
-    'question': "Respond to the Input while following the formatting instructions. \n Input:"+question+" \n formatting instructions:enclose the answer or response to the input in <p> html tags and add other styling using tags to this <p> element, where appropriate, if you can."
-    """
     try:
-        response_content = conversation({'question': question
+        response_content = conversation({'question': question})
     except:
         print("conversation chain limit exceeded")
         text_chunks = ""
         vectorstore = get_vectorstore(text_chunks)
         conversation = get_conversation_chain(vectorstore)
-        response_content = conversation({'question': question
+        response_content = conversation({'question': question})
 
 
-    #response_message = markdown.markdown(response_content.get('answer'))
     response_message = response_content.get('answer')
     response_context = response_content.get('source_documents')
     #P, R, F1 = score([response_message], [str(response_context)],lang="en")
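The surviving except branch only works because get_vectorstore ignores its text_chunks argument when vectorstore.faiss already exists on disk: passing the empty string just reloads the pickled index, and the real effect of rebuilding the chain is a fresh, empty ConversationBufferMemory. The failure being caught is most plausibly the buffered chat history outgrowing the model's context window. The same pattern factored into a helper (a sketch; the helper is mine, not in the source):

    def ask_with_retry(question, conversation, vectorstore):
        # If the chain raises (typically: accumulated chat history no longer
        # fits the context window), rebuild it with empty memory and retry once.
        try:
            return conversation({'question': question}), conversation
        except Exception:
            conversation = get_conversation_chain(vectorstore)
            return conversation({'question': question}), conversation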
@@ -127,39 +92,13 @@ def send_to_backend():
     F1 = cosine_similarity(TfidfVectorizer().fit_transform(documents), TfidfVectorizer().fit_transform(documents))
     F1 = (F1[0][1]+0.3) / (np.linalg.norm(F1[0]))
 
-    #print("context: "+str(response_context))
-    print("answer: " + response_message)
-    print("llamaanswer: "+llamaAnswer)
-
-    #If the 'Context' does not contain relevant information then respond to the 'Input' in any appropriate manner."
-    #Formatting and combine responses
-
-    # prompt = [{"role": "user", "content": f"""
-    # Respond to the given 'Input' using the provided contexts 'Context1' and 'Context2' respectivley. Prefer to include information from 'Context1' in your response. For inputs such as "Hey," "Hi," or "Can you help me," or any other greetings in any languages respond from 'Context1'. Do not use phrase like "from the context provided" and do not refer to 'Context1', 'Context2' in any way in your response. Your response should be formatted using HTML tags for improved readability.</p>
-    # Input: [{question}] \n
-    # Context1: [{response_message}] \n
-    # Context2:[{llamaAnswer}]
-    # """}]
-
-
-
-    prompt = f"""
-    Use HTML formatting to make the text labelled 'Text' to improve readibility. Do not refer to 'Text' in you response.\n
-    Text: {response_message}
-    """
-
-    response = openai.Completion.create(
-        engine="gpt-3.5-turbo-instruct", # GPT-3 Base model
-        prompt=prompt,
-        max_tokens = 500 # You can adjust this based on your desired question length
-    )
 
-    finalAnswer = markdown.markdown(
-    print("final Answer:", finalAnswer)
+    finalAnswer = markdown.markdown(response_message)
+    #print("final Answer:", finalAnswer)
 
 
 
-    return jsonify({"response": finalAnswer+"""<br><p style="color: yellow; text-align: right;font-style: italic; font-size: 14px;margin-bottom: 0;">
+    return jsonify({"response": finalAnswer+"""<br><p style="color: yellow; text-align: right;font-style: italic; font-size: 14px;margin-bottom: 0;">F1 SCORE: """+str(F1)+"""</p>"""})
 
 if __name__ == '__main__':
     load_dotenv()
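The retained F1 lines score the answer against its retrieved context by TF-IDF cosine similarity; documents is assembled on a line outside both hunks (old 126 / new 91), presumably as the pair [response_message, str(response_context)]. Two review notes: fitting TfidfVectorizer twice is redundant (both calls fit the same texts), and the (x + 0.3) / norm rescaling is ad hoc rather than a standard metric. The core computation, stripped down with placeholder texts:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Placeholder pair standing in for [response_message, str(response_context)].
    documents = ["the generated answer text", "the retrieved source chunks"]

    tfidf = TfidfVectorizer().fit_transform(documents)  # 2 x |vocab| sparse matrix
    sim = cosine_similarity(tfidf)                      # 2x2 matrix, diagonal = 1.0
    print(sim[0][1])                                    # answer-vs-context similarity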
@@ -167,10 +106,11 @@ if __name__ == '__main__':
     # If not installed, download and install the model
     spacy.cli.download("en_core_web_sm")
 
-
+    #dataset to FAISS Vector Index
     pdf_docs = ["totaltax.pdf"]
     raw_text = get_pdf_text(pdf_docs)
-
+
+    #split
     raw_text1 = raw_text[0:999999]
     raw_text2=raw_text[999000:]
     text_chunks1 = get_text_chunks(raw_text1)
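The two slices overlap on purpose: raw_text[0:999999] ends at index 999998 and raw_text[999000:] starts at index 999000, so 999 characters appear in both halves and a sentence straddling the cut is not silently lost. A generic version of the same idea (function name and defaults are mine):

    def split_with_overlap(text, size=999_999, overlap=999):
        # Successive windows share `overlap` characters so content at each
        # seam lands in at least one complete window.
        parts, start = [], 0
        while start < len(text):
            parts.append(text[start:start + size])
            start += size - overlap
        return parts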
@@ -181,79 +121,11 @@ if __name__ == '__main__':
         textelem = str(chunk)
         textelem = textelem[1:len(textelem)-2]
         text_chunks.append(textelem)
-    print("I'm here 1")
 
-
-    # text_chunks = ""
+    #create vector store and conversational retrieval chain
     vectorstore = get_vectorstore(text_chunks)
     conversation = get_conversation_chain(vectorstore)
-    print("I'm here 2")
-    """
-    questions = []
-    answers = []
-    generated_answers = []
-    contexts=[]
-    questioncontext = {}
-
-
-    excel_file_path = 'MMRRetriever\MistralT0.9MMR.xlsx'
-    text_file_path = 'output.txt'
-
-    df = pd.read_excel(excel_file_path)
-    questions = df['question'].tolist()
-    answers = df['ground_truths'].tolist()
-    """
 
 
-    """
-    with open(text_file_path, 'r') as file:
-        file_content = file.read()
-
-    generated_answers = ast.literal_eval(file_content)
-
-    n = len(generated_answers)
-
-    questions = questions[n:]
-    """
-    """
-    for question in questions:
-        try:
-            response = conversation({'question': str(question)})
-        except:
-            conversation = get_conversation_chain(vectorstore)
-            response = conversation({'question': question})
-
-        print(len(response['answer']))
-        generated_answers.append(str(response['answer']))
-        print(len(str(response['source_documents'])))
-        contexts.append(str(response['source_documents']))
-
-        with open(text_file_path, 'w') as file:
-            file.write(str(generated_answers))
-
-        print(len(generated_answers))
-
-    while len(contexts) != len(generated_answers):
-        contexts.append("")
-
-    df["context"] = contexts
-    df["generated_answer"] = generated_answers
-
-
-
-
-    P, R, F1 = score(generated_answers, answers,lang="en")
-    F1array = list(F1)
-
-    df["Bert Score"] = F1array
-
-    combinedScore = F1.mean()
-    print(combinedScore)
-    #F1array.append(combinedScore)
-    df["Bert Score"] = F1array
-
-
-    df.to_excel(excel_file_path, index=False)
-    """
-    print("I'm done")
+
     app.run(port=3000)
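For completeness, a hypothetical client call against the running app. The @app.route decorator for send_to_backend sits outside every hunk, so the route path and the request key below are assumptions; only the port (3000) and the "response" key of the returned JSON come from the code shown:

    import requests

    resp = requests.post("http://localhost:3000/chat",            # assumed route
                         json={"question": "What is the total tax payable?"})
    print(resp.json()["response"])   # HTML answer plus the yellow F1-score footer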