# from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Weaviate
from langchain.prompts import ChatPromptTemplate
from langchain.document_loaders.pdf import PyPDFLoader
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
import weaviate
from weaviate.embedded import EmbeddedOptions
import pandas as pd
from dotenv import load_dotenv
import os
import json
import gradio as gr
# from langchain_community.llms import ctransformers
# from ctransformers import AutoModelForCausalLM

load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY")  # read by langchain_openai via the environment

TEMP_DIR = "../temp"


# csv
def agent(filename: str):
    """Build a pandas DataFrame agent over the uploaded CSV file."""
    llm = ChatOpenAI(
        model="gpt-3.5-turbo-0125",
        # model="gpt-4",
        temperature=0.0,
        # max_tokens=256,
        # top_p=0.5,
    )
    df = pd.read_csv(filename, encoding="unicode_escape")
    # NOTE: recent langchain_experimental releases also require
    # allow_dangerous_code=True here.
    pandas_df_agent = create_pandas_dataframe_agent(llm, df, verbose=True)
    return pandas_df_agent


def get_response(agent, query):
    """Wrap the query in formatting instructions and run it through the agent."""
    prompt = (
        """
        For the following query, if it requires drawing a table, reply as follows:
        {"table": {"columns": ["column1", "column2", ...], "data": [[value1, value2, ...], [value1, value2, ...], ...]}}

        If the query requires creating a bar chart, reply as follows:
        {"bar": {"columns": ["A", "B", "C", ...], "data": [25, 24, 10, ...]}}

        If the query requires creating a line chart, reply as follows:
        {"line": {"columns": ["A", "B", "C", ...], "data": [25, 24, 10, ...]}}

        There can only be two types of charts, "bar" and "line".

        If it is just asking a question that requires neither, reply as follows:
        {"answer": "answer"}
        Example:
        {"answer": "The product with the highest sales is 'Classic Cars.'"}

        Write supportive numbers if there are any in the answer.
        Example:
        {"answer": "The product with the highest sales is 'Classic Cars' with 1111 sales."}

        If you do not know the answer, reply as follows:
        {"answer": "I do not know."}
        Do not hallucinate or make up data. If the data is not available, reply "I do not know."

        Return all output as a string in double quotes. All strings in the "columns" list and the "data" list should be in double quotes.
        For example: {"columns": ["title", "ratings_count"], "data": [["Gilead", 361], ["Spider's Web", 5164]]}

        Let's think step by step.

        Below is the query.
        Query:
        """
        + query
    )
    response = agent.run(prompt)
    return str(response)


def return_response(response: str) -> dict:
    """Decode the agent's reply; return None if it is not valid JSON."""
    try:
        return json.loads(response)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        return None


def write_response(response_dict: dict):
    """Render a decoded response as plain text or a Gradio plot component."""
    if response_dict is None:
        return "Decoded response is None. Please retry with a better prompt."
    if "answer" in response_dict:
        # st.write(answer)
        return response_dict["answer"]
    if "bar" in response_dict:
        data = response_dict["bar"]  # {"columns": labels, "data": values}
        df = pd.DataFrame(data)
        # st.bar_chart(df)
        return gr.BarPlot(df, x="columns", y="data")
    if "line" in response_dict:
        data = response_dict["line"]
        df = pd.DataFrame(data)
        # st.line_chart(df)
        return gr.LinePlot(df, x="columns", y="data")
    # if "table" in response_dict:
    #     data = response_dict["table"]
    #     df = pd.DataFrame(data["data"], columns=data["columns"])
    #     # st.table(df)
    return "Unrecognized response format. Please retry with a better prompt."
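
# --- Usage sketch (illustrative; not part of the original app flow) ---
# Shows the JSON contract the prompt in get_response() enforces: a raw agent
# reply is decoded with return_response() and rendered with write_response().
# The payload below is hypothetical sample data, not output from a real run.
def _demo_response_round_trip():
    sample = '{"bar": {"columns": ["A", "B", "C"], "data": [25, 24, 10]}}'
    decoded = return_response(sample)   # dict on valid JSON, None otherwise
    return write_response(decoded)      # gr.BarPlot for this payload
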
def ques_csv(data, question: str):
    """CSV pipeline: build the agent, query it, decode and render the reply."""
    csv_agent = agent(data)
    response = get_response(agent=csv_agent, query=question)
    decoded_response = return_response(response)
    answer = write_response(decoded_response)
    return answer


# pdf
def ques_pdf(data, question: str):
    """PDF pipeline: load, chunk, embed into Weaviate, then answer via RAG."""
    doc = load_pdf(data)
    chunks = split_pdf(doc)
    retriever = store_retrieve(chunks)
    prompt = write_prompt()
    answer = ques_llm(retriever, prompt, question)
    # st.write(answer)
    return answer


def make_dir():
    if not os.path.exists(TEMP_DIR):
        os.makedirs(TEMP_DIR)


def upload(uploaded_file):
    # expects a file-like object exposing .name and .getvalue()
    if uploaded_file is not None:
        file_path = os.path.join(TEMP_DIR, uploaded_file.name)
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getvalue())
        return file_path


def load_pdf(filename: str):
    loader = PyPDFLoader(filename)
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    pages = loader.load_and_split(text_splitter=text_splitter)
    return pages


def split_pdf(doc):
    # re-split the loaded pages into smaller chunks for embedding
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(doc)
    return chunks


def store_retrieve(chunks):
    # embedded (in-process) Weaviate instance; no external server needed
    client = weaviate.Client(embedded_options=EmbeddedOptions())
    vectorstore = Weaviate.from_documents(
        client=client,
        documents=chunks,
        embedding=OpenAIEmbeddings(),
        by_text=False,
    )
    retriever = vectorstore.as_retriever()
    return retriever


def write_prompt():
    template = """You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Question: {question}
    Context: {context}
    Answer:
    """
    prompt = ChatPromptTemplate.from_template(template)
    return prompt


def ques_llm(retriever, prompt, question):
    llm = ChatOpenAI(model_name="gpt-4", temperature=0)
    # llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q8_0.bin", temperature=0)
    # llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q4_0.bin", temperature=0)
    rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    ans = rag_chain.invoke(question)
    return ans
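
# --- Minimal launch sketch (assumption: the original UI wiring is not shown) ---
# One plausible way to expose ques_csv / ques_pdf through Gradio. The
# extension-based routing and component layout here are illustrative, not the
# original author's interface; chart responses (gr.BarPlot / gr.LinePlot)
# would need a plot component rather than the Textbox used below.
if __name__ == "__main__":
    make_dir()

    def answer(file_path, question):
        if file_path is None:
            return "Please upload a CSV or PDF file."
        if file_path.endswith(".csv"):
            return ques_csv(file_path, question)
        return ques_pdf(file_path, question)

    demo = gr.Interface(
        fn=answer,
        inputs=[gr.File(type="filepath", label="CSV or PDF"),
                gr.Textbox(label="Question")],
        outputs=gr.Textbox(label="Answer"),
    )
    demo.launch()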