Spaces:
Sleeping
Sleeping
HeavenWaters
committed on
Commit
•
fad4db7
1
Parent(s):
f6b7d10
Upload 3 files
Browse files- api.py +259 -0
- app.py +62 -0
- requirements.txt +17 -0
api.py
ADDED
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Install the faiss package first:
|
2 |
+
# pip install faiss
|
3 |
+
|
4 |
+
from flask import Flask, request, jsonify
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
import pandas as pd
|
7 |
+
from PyPDF2 import PdfReader
|
8 |
+
import openai
|
9 |
+
import spacy
|
10 |
+
from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter
|
11 |
+
#from langchain.text_splitter import CharacterTextSplitter
|
12 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
13 |
+
from langchain.vectorstores import FAISS
|
14 |
+
from langchain.memory import ConversationBufferMemory
|
15 |
+
from langchain.chains import ConversationalRetrievalChain
|
16 |
+
#from langchain_core.prompts import PromptTemplate
|
17 |
+
from langchain.chat_models import ChatOpenAI
|
18 |
+
from langchain.llms import HuggingFaceHub
|
19 |
+
# from bert_score import score
|
20 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
21 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
22 |
+
#from transformers import AutoTokenizer, AutoModel
|
23 |
+
from scipy.spatial.distance import cosine
|
24 |
+
import markdown
|
25 |
+
import os
|
26 |
+
import pickle
|
27 |
+
from flask_cors import CORS
|
28 |
+
#import pandas as pd
|
29 |
+
#import ast
|
30 |
+
import requests
|
31 |
+
import numpy as np
|
32 |
+
|
33 |
+
app = Flask(__name__)
|
34 |
+
CORS(app)
|
35 |
+
|
36 |
+
conversation = ""
|
37 |
+
|
38 |
+
|
39 |
+
|
40 |
+
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of the given PDFs.

    Args:
        pdf_docs: iterable of file paths or file-like objects accepted by
            PyPDF2's ``PdfReader``.

    Returns:
        A single string containing the text of all pages, in document order.
    """
    pages = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None (e.g. image-only pages); treat
            # that as an empty string instead of raising a TypeError on "+".
            pages.append(page.extract_text() or "")
    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join(pages)
|
47 |
+
|
48 |
+
def get_text_chunks(raw_text):
    """Split raw text into groups of semantically similar sentences.

    Uses semantic_split's ``SimilarSentenceSplitter``, which groups adjacent
    sentences by embedding similarity instead of a fixed character count.
    (An earlier ``CharacterTextSplitter`` implementation used to live here as
    a triple-quoted string — dead code that shadowed the docstring — and has
    been removed.)

    Args:
        raw_text: the full document text to split.

    Returns:
        The chunk list produced by ``SimilarSentenceSplitter.split``.
    """
    model = SentenceTransformersSimilarity()
    sentence_splitter = SpacySentenceSplitter()
    splitter = SimilarSentenceSplitter(model, sentence_splitter)
    return splitter.split(raw_text)
|
59 |
+
|
60 |
+
def get_vectorstore(text_chunks, vectorstore_filename="vectorstore.faiss"):
    """Return a FAISS vectorstore, loading a pickled cache when one exists.

    When no cache file is present, embeds ``text_chunks`` with OpenAI
    embeddings, builds a fresh store, and pickles it for the next run.

    NOTE(review): ``pickle.load`` executes arbitrary code if the cache file
    is tampered with — only safe for trusted, locally written caches.
    NOTE(review): a stale cache is returned even when ``text_chunks`` has
    changed; delete the cache file to force a rebuild.
    """
    print("I'm in")
    cache_is_present = os.path.exists(vectorstore_filename)
    if not cache_is_present:
        # Cache miss: embed every chunk and persist the store for reuse.
        store = FAISS.from_texts(texts=text_chunks, embedding=OpenAIEmbeddings())
        with open(vectorstore_filename, 'wb') as cache_file:
            pickle.dump(store, cache_file)
        return store
    # Cache hit: deserialize the previously built store.
    with open(vectorstore_filename, 'rb') as cache_file:
        store = pickle.load(cache_file)
    print("Hello vectorstore loaded")
    return store
|
73 |
+
|
74 |
+
def get_conversation_chain(vectorstore):
    """Build a ConversationalRetrievalChain over the given vectorstore.

    The chain keeps its own chat history in a ConversationBufferMemory and
    returns the retrieved source documents alongside each answer.
    """
    # max_tokens=300 caps only the generated answer length, not the prompt.
    llm = ChatOpenAI(max_tokens=300)
    # Alternative open-source LLMs tried during development (kept for reference):
    # llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-v0.1", model_kwargs={"temperature":0.9, "max_length":2048},task="text-generation")
    #llm = HuggingFaceHub(repo_id="microsoft/phi-2", model_kwargs={"temperature":0.1, "max_length":1024},task="text-generation")
    #llm = HuggingFaceHub(repo_id="FinGPT/fingpt-mt_qwen-7b_lora", model_kwargs={"temperature":0.5, "max_length":1024},task="text-generation")
    # output_key='answer' is needed because return_source_documents=True makes
    # the chain emit several keys and the memory must know which one to store.
    memory = ConversationBufferMemory(memory_key='chat_history',output_key='answer', return_messages=True)
    # rephrase_question=False sends the user's question to the retriever verbatim.
    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory ,response_if_no_docs_found="I don't have this information",rephrase_question=False,return_source_documents=True)
    return conversation_chain
|
82 |
+
|
83 |
+
@app.route('/send_to_backend', methods=['POST'])
def send_to_backend():
    """Answer a user question by combining a remote Llama service with the
    local ConversationalRetrievalChain, then HTML-format the result.

    Expects a JSON body ``{"userMsg": <question>}`` and returns
    ``{"response": <html answer + similarity-score footer>}``.
    """
    global conversation
    question = request.get_json().get("userMsg")
    print("Question: ", question)

    # Ask the external Llama backend for an auxiliary answer. Best-effort:
    # fall back to the marker string when the service is unreachable, replies
    # with non-JSON, or omits the 'Answer' key (the original crashed on the
    # last two cases, and concatenated None into a print on the third).
    url = "http://119.63.132.178:8080/chat"
    llamaAnswer = "couldn't get response!"
    try:
        ques = {'text': question}
        print("ques", ques)
        upstream = requests.post(url, json=ques)
        answer = upstream.json().get('Answer')
        if answer is not None:
            llamaAnswer = answer
    except Exception:
        # Was a bare `except:`; Exception keeps the best-effort semantics
        # without swallowing KeyboardInterrupt/SystemExit.
        pass

    # Query the retrieval chain with the Llama answer appended as extra
    # context. If the chain raises (e.g. context/token limit exceeded),
    # rebuild it from the cached vectorstore and retry once. The augmented
    # question is built once so both attempts send the identical prompt
    # (the original retry differed by a stray space).
    augmented = {'question': question + f" (You can use this additional context in addition to the context\n Additional-Context: {llamaAnswer})"}
    try:
        response_content = conversation(augmented)
    except Exception:
        print("conversation chain limit exceeded")
        # "" makes get_vectorstore load the pickled cache (chunks are only
        # needed on a cache miss) — mirrors the original recovery path.
        vectorstore = get_vectorstore("")
        conversation = get_conversation_chain(vectorstore)
        response_content = conversation(augmented)

    response_message = response_content.get('answer')
    response_context = response_content.get('source_documents')

    # Pseudo "RAGAS" score: TF-IDF cosine similarity between the answer and
    # its retrieved context, shifted by 0.3 and normalised. Fit a single
    # vectorizer (the original fitted two identical ones on the same docs).
    documents = [response_message, str(response_context)]
    tfidf = TfidfVectorizer().fit_transform(documents)
    F1 = cosine_similarity(tfidf, tfidf)
    F1 = (F1[0][1] + 0.3) / (np.linalg.norm(F1[0]))

    print("answer: " + response_message)
    print("llamaanswer: " + llamaAnswer)

    # Ask the completion model to wrap the answer in HTML for the frontend.
    prompt = f"""
    Use HTML formatting to make the text labelled 'Text' to improve readibility. Do not refer to 'Text' in you response.\n
    Text: {response_message}
    """

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",  # GPT-3 base model
        prompt=prompt,
        max_tokens=500  # caps the length of the formatted answer
    )

    finalAnswer = markdown.markdown(response.choices[0].text.strip())
    print("final Answer:", finalAnswer)

    return jsonify({"response": finalAnswer + """<br><p style="color: yellow; text-align: right;font-style: italic; font-size: 14px;margin-bottom: 0;">RAGAS SCORE: """ + str(F1) + """</p>"""})
|
163 |
+
|
164 |
+
if __name__ == '__main__':
    # Startup: load API keys from .env and make sure the spaCy model used by
    # the sentence splitter is available.
    load_dotenv()
    if not spacy.util.is_package("en_core_web_sm"):
        # If not installed, download and install the model
        spacy.cli.download("en_core_web_sm")

    print("I'm here")
    pdf_docs = ["totaltax.pdf"]
    raw_text = get_pdf_text(pdf_docs)
    # #text_chunkslist = get_text_chunks(raw_text)
    # Split the document into two slices before chunking.
    # NOTE(review): the slices overlap by 999 characters (999000..999999) —
    # presumably a deliberate chunk overlap, but confirm it isn't a typo.
    raw_text1 = raw_text[0:999999]
    raw_text2=raw_text[999000:]
    text_chunks1 = get_text_chunks(raw_text1)
    text_chunks2=get_text_chunks(raw_text2)
    text_chunkslist = text_chunks1+text_chunks2
    text_chunks=[]
    for chunk in text_chunkslist:
        # Each chunk is stringified and its bracket delimiters trimmed off.
        # NOTE(review): [1:len-2] drops the first char and the LAST TWO —
        # looks like an off-by-one versus [1:-1]; verify against the repr
        # format that SimilarSentenceSplitter produces.
        textelem = str(chunk)
        textelem = textelem[1:len(textelem)-2]
        text_chunks.append(textelem)
    print("I'm here 1")


    # text_chunks = ""
    # Build (or load the cached) vectorstore and the global chain used by
    # the /send_to_backend route.
    vectorstore = get_vectorstore(text_chunks)
    conversation = get_conversation_chain(vectorstore)
    print("I'm here 2")
    # The triple-quoted blocks below are disabled evaluation code (BERT-score
    # benchmarking over an Excel question set), kept for reference.
    """
    questions = []
    answers = []
    generated_answers = []
    contexts=[]
    questioncontext = {}


    excel_file_path = 'MMRRetriever\MistralT0.9MMR.xlsx'
    text_file_path = 'output.txt'

    df = pd.read_excel(excel_file_path)
    questions = df['question'].tolist()
    answers = df['ground_truths'].tolist()
    """


    """
    with open(text_file_path, 'r') as file:
        file_content = file.read()

    generated_answers = ast.literal_eval(file_content)

    n = len(generated_answers)

    questions = questions[n:]
    """
    """
    for question in questions:
        try:
            response = conversation({'question': str(question)})
        except:
            conversation = get_conversation_chain(vectorstore)
            response = conversation({'question': question})

        print(len(response['answer']))
        generated_answers.append(str(response['answer']))
        print(len(str(response['source_documents'])))
        contexts.append(str(response['source_documents']))

    with open(text_file_path, 'w') as file:
        file.write(str(generated_answers))

    print(len(generated_answers))

    while len(contexts) != len(generated_answers):
        contexts.append("")

    df["context"] = contexts
    df["generated_answer"] = generated_answers




    P, R, F1 = score(generated_answers, answers,lang="en")
    F1array = list(F1)

    df["Bert Score"] = F1array

    combinedScore = F1.mean()
    print(combinedScore)
    #F1array.append(combinedScore)
    df["Bert Score"] = F1array


    df.to_excel(excel_file_path, index=False)
    """
    print("I'm done")
    # Serve the Flask API on the port app.py's frontend expects (3000).
    app.run(port=3000)
|
app.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Streamlit chat frontend for the Tax Tajweez RAG backend (api.py).

Renders a chat UI, forwards each user message to the Flask endpoint at
localhost:3000/send_to_backend, and displays the HTML-formatted answer.
"""
import openai
import streamlit as st
import subprocess
import requests

st.title("Tax Tajweez")

# Initialize session state if it doesn't exist
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display previous chat messages (Streamlit re-runs this script on every
# interaction, so the full history is re-rendered from session state).
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"], unsafe_allow_html=True)

# Get user input
if prompt := st.chat_input("Ask me anything related to income tax..."):
    # Add user message to session state
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Get assistant response
    with st.expander("Assistant Response", expanded=True):
        with st.spinner("I'm thinking..."):
            # Define the URL of the API endpoint
            url = "http://localhost:3000/send_to_backend"
            # Define the data you want to send in the request body
            data = {"userMsg": prompt}
            # Make the POST request
            response = requests.post(url, json=data)
            # Check if the request was successful (status code 200)
            if response.status_code == 200:
                # Render the assistant response with markdown and allow HTML
                assistant_response = (response.json())['response']
                # Skip rendering if the identical answer is already in the
                # history (guards against duplicate entries on reruns).
                if assistant_response not in [msg.get("content") for msg in st.session_state.messages if msg.get("role") == "assistant"]:
                    st.markdown(assistant_response, unsafe_allow_html=True)
                    # Add assistant's response to session state
                    st.session_state.messages.append({"role": "assistant", "content": assistant_response})

            else:
                st.error(f"Error: {response.status_code}")

# Specify the path to the Python file you want to run
file_path = 'api.py'
# Run the Python file
# NOTE(review): this blocking subprocess launches the backend from the
# frontend script and runs on EVERY Streamlit rerun — confirm this is the
# intended way to start api.py (a process manager would be more typical).
subprocess.run(['python',file_path])
|
requirements.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
flask
|
2 |
+
flask-cors
|
3 |
+
python-dotenv
|
4 |
+
pypdf2
|
5 |
+
pydantic
|
6 |
+
pandas
|
7 |
+
langchain==0.0.345
|
8 |
+
faiss-cpu==1.7.4
|
9 |
+
openai==0.28.0
|
10 |
+
huggingface_hub
|
11 |
+
sentence_transformers
|
12 |
+
semantic-split==0.1.0
|
13 |
+
tiktoken
|
14 |
+
cohere
|
15 |
+
spacy==3.7.2
|
16 |
+
markdown2
|
17 |
+
markdown
|