Spaces:

adityaagrawal
/

rag-assignment

Runtime error

File size: 6,724 Bytes

7c0f544

# from langchain_openai import OpenAI
from langchain_experimental.agents import create_pandas_dataframe_agent
import pandas as pd
from dotenv import load_dotenv
import os
import json
from langchain_openai import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import weaviate
from langchain_community.vectorstores import Weaviate
from weaviate.embedded import EmbeddedOptions
from langchain.prompts import ChatPromptTemplate
from langchain.document_loaders.pdf import PyPDFLoader
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
import gradio as gr
# from langchain_community.llms import ctransformers
# from ctransformers import AutoModelForCausalLM

# Run python-dotenv's loader first so that any key it places in the process
# environment is visible to the lookup below (None when the variable is unset).
load_dotenv()
API_KEY = os.environ.get("OPENAI_API_KEY")

# Directory used by make_dir() / upload() for files saved to disk.
TEMP_DIR = "../temp"

# csv
def agent(filename: str):
    """Build a pandas-dataframe agent over the CSV file at ``filename``.

    A ``gpt-3.5-turbo-0125`` chat model with temperature 0.0 is created
    first, then the CSV is read with the ``unicode_escape`` encoding, and
    both are passed to ``create_pandas_dataframe_agent`` with ``verbose=True``.

    Returns:
        The agent object produced by ``create_pandas_dataframe_agent``.
    """
    chat_model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.0)
    frame = pd.read_csv(filename, encoding='unicode_escape')
    return create_pandas_dataframe_agent(chat_model, frame, verbose=True)

def get_response(agent, query):
    """Run ``query`` through ``agent`` with JSON-formatting instructions.

    The instructions ask for a reply shaped as one of ``{"table": ...}``,
    ``{"bar": ...}``, ``{"line": ...}`` or ``{"answer": ...}``; the user's
    query is appended directly after the trailing ``Query:`` marker.

    Args:
        agent: Object exposing ``run(prompt)``.
        query: The user's question as a string.

    Returns:
        The string form of whatever ``agent.run`` returned.
    """
    # Runtime text sent to the model: keep it exactly as-is.
    instructions = """
            For the following query, if it requires drawing a table, reply as follows:
            {"table": {"columns": ["column1", "column2", ...], "data": [[value1, value2, ...], [value1, value2, ...], ...]}}

            If the query requires creating a bar chart, reply as follows:
            {"bar": {"columns": ["A", "B", "C", ...], "data": [25, 24, 10, ...]}}

            If the query requires creating a line chart, reply as follows:
            {"line": {"columns": ["A", "B", "C", ...], "data": [25, 24, 10, ...]}}

            There can only be two types of charts, "bar" and "line".

            If it is just asking a question that requires neither, reply as follows:
            {"answer": "answer"}
            Example:
            {"answer": "The product with the highest sales is 'Classic Cars.'"}

            Write supportive numbers if there are any in the answer.
            Example:
            {"answer": "The product with the highest sales is 'Classic Cars' with 1111 sales."}

            If you do not know the answer, reply as follows:
            {"answer": "I do not know."}

            Do not hallucinate or make up data. If the data is not available, reply "I do not know."
            
            Return all output as a string in double quotes. 

            All strings in "columns" list and data list, should be in double quotes,

            For example: {"columns": ["title", "ratings_count"], "data": [["Gilead", 361], ["Spider's Web", 5164]]}

            Lets think step by step.

            Below is the query.
            Query: 
            """
    reply = agent.run(instructions + query)
    return str(reply)

def return_response(response: str) -> "dict | None":
    """Decode the agent's reply into a dict, or return None when impossible.

    The reply is parsed as JSON. If that fails and the reply is a string,
    a second attempt is made on the span from the first ``{`` to the last
    ``}``, so replies wrapped in prose or Markdown code fences still decode.

    Args:
        response: Raw text returned by the agent.

    Returns:
        The decoded JSON object as a dict. None is returned (and a
        diagnostic is printed) when the input is not text, is not valid
        JSON, or decodes to something other than a JSON object.
    """
    try:
        parsed = json.loads(response)
    except TypeError as e:
        # json.loads rejects non-text input such as None.
        print(f"TypeError: {e}")
        return None
    except json.JSONDecodeError as e:
        parsed = None
        if isinstance(response, str):
            # Retry on the outermost {...} span to skip surrounding text.
            start, end = response.find("{"), response.rfind("}")
            if 0 <= start < end:
                try:
                    parsed = json.loads(response[start:end + 1])
                except json.JSONDecodeError:
                    parsed = None
        if parsed is None:
            print(f"JSONDecodeError: {e}")
            return None

    # Callers probe the result with `"key" in result`; a bare number or list
    # would either crash that check or silently match nothing.
    if not isinstance(parsed, dict):
        print(f"Unexpected JSON type: {type(parsed).__name__}")
        return None
    return parsed

def write_response(response_dict: dict):
    """Turn a decoded agent reply into something displayable.

    Args:
        response_dict: Decoded reply, or None when decoding failed.

    Returns:
        * a fixed retry message when ``response_dict`` is None;
        * the ``"answer"`` value when that key is present;
        * ``gr.BarPlot`` / ``gr.LinePlot`` built from a dataframe indexed by
          its ``"columns"`` column for ``"bar"`` / ``"line"`` payloads;
        * None for anything else.
    """
    # Guard clause: nothing could be decoded upstream.
    if response_dict is None:
        return "Decoded response is None. Please retry with a better prompt."

    if "answer" in response_dict:
        return response_dict["answer"]

    if "bar" in response_dict:
        # Build row-wise from the dict, then flip so keys become columns.
        bar_frame = (
            pd.DataFrame.from_dict(response_dict["bar"], orient='index')
            .transpose()
            .set_index("columns")
        )
        return gr.BarPlot(bar_frame)

    if "line" in response_dict:
        line_frame = pd.DataFrame(response_dict["line"]).set_index("columns")
        return gr.LinePlot(line_frame)

    # NOTE(review): "table" payloads (and any other key) are not rendered
    # here and yield None.
    return None

def ques_csv(data, question: str):
    """Answer ``question`` about the CSV ``data``.

    Builds an agent for the CSV, queries it, decodes the reply and converts
    it with ``write_response``; the value returned is whatever
    ``write_response`` produces.
    """
    raw_reply = get_response(agent=agent(data), query=question)
    return write_response(return_response(raw_reply))

# pdf
def ques_pdf(data, question: str):
    """Answer ``question`` about the PDF ``data`` via retrieval.

    Steps, in order: load the PDF, split it into chunks, index the chunks
    and obtain a retriever, build the prompt, then ask the LLM. Returns the
    value produced by ``ques_llm``.
    """
    pages = load_pdf(data)
    retriever = store_retrieve(split_pdf(pages))
    return ques_llm(retriever, write_prompt(), question)

def make_dir():
    """Ensure the ``TEMP_DIR`` directory exists, creating parents as needed.

    Uses ``exist_ok=True`` instead of a separate existence check, so a
    directory created by another process between check and creation no
    longer raises ``FileExistsError``.

    Raises:
        OSError: If the directory cannot be created (for example, the path
            exists but is not a directory).
    """
    os.makedirs(TEMP_DIR, exist_ok=True)

def upload(uploaded_file):
    """Save an uploaded file under ``TEMP_DIR`` and return the saved path.

    Args:
        uploaded_file: Object exposing ``.name`` (the file name) and
            ``.getvalue()`` (the content as bytes), or None.

    Returns:
        The path the content was written to, or None when no file was
        supplied (previously this case raised ``UnboundLocalError``).

    Raises:
        ValueError: If the supplied name contains no usable file name.
    """
    if uploaded_file is None:
        return None

    # The name comes from the client: keep only its final component so it
    # cannot escape TEMP_DIR via "../" segments or an absolute path.
    safe_name = os.path.basename(uploaded_file.name)
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload file name: {uploaded_file.name!r}")

    # Make sure the target directory exists so open() does not fail.
    os.makedirs(TEMP_DIR, exist_ok=True)

    file_path = os.path.join(TEMP_DIR, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getvalue())

    return file_path

def load_pdf(filename: str):
    """Load the PDF at ``filename`` and return its split documents.

    The file is opened with ``PyPDFLoader`` and split using a
    ``CharacterTextSplitter`` (chunk size 1000, overlap 20).
    """
    pdf_loader = PyPDFLoader(f"{filename}")
    return pdf_loader.load_and_split(
        text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    )

def split_pdf(doc):
    """Split the documents in ``doc`` with a 500-size, 50-overlap splitter."""
    return CharacterTextSplitter(
        chunk_size=500, chunk_overlap=50
    ).split_documents(doc)

def store_retrieve(chunks):
    """Index ``chunks`` in Weaviate and return a retriever over them.

    A Weaviate client is created with ``EmbeddedOptions()``, the chunks are
    stored through ``Weaviate.from_documents`` using ``OpenAIEmbeddings``
    (with ``by_text=False``), and the store's retriever is returned.
    """
    embedded_client = weaviate.Client(embedded_options=EmbeddedOptions())
    store = Weaviate.from_documents(
        documents=chunks,
        embedding=OpenAIEmbeddings(),
        client=embedded_client,
        by_text=False,
    )
    return store.as_retriever()

def write_prompt():
    """Return the chat prompt template used for PDF question answering.

    The template has two placeholders, ``{question}`` and ``{context}``.
    """
    # Runtime text sent to the model: keep it exactly as-is.
    rag_template = """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. 
    Question: {question} 
    Context: {context} 
    Answer:
    """
    return ChatPromptTemplate.from_template(rag_template)

def ques_llm(retriever, prompt, question):
    """Answer ``question`` by piping retrieved context through a GPT-4 chain.

    Args:
        retriever: Supplies the ``context`` input of the prompt.
        prompt: Prompt template taking ``context`` and ``question``.
        question: The user's question; passed through unchanged as the
            ``question`` input.

    Returns:
        The result of invoking the chain, whose last stage is
        ``StrOutputParser``.
    """
    chat_model = ChatOpenAI(model_name="gpt-4", temperature=0)
    chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
    pipeline = chain_inputs | prompt | chat_model | StrOutputParser()
    return pipeline.invoke(question)