# rag-assignment/utils.py
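"""Utilities for a small RAG demo app.

Two flows are implemented here: question answering over CSV files via a
LangChain pandas-dataframe agent (answers come back as JSON and are rendered
as text or Gradio plots), and question answering over PDFs via a
Weaviate-backed retrieval chain.
"""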
# from langchain_openai import OpenAI
from langchain_experimental.agents import create_pandas_dataframe_agent
import pandas as pd
from dotenv import load_dotenv
import os
import json
from langchain_openai import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import weaviate
from langchain_community.vectorstores import Weaviate
from weaviate.embedded import EmbeddedOptions
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
import gradio as gr
# from langchain_community.llms import ctransformers
# from ctransformers import AutoModelForCausalLM
load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY")  # ChatOpenAI and OpenAIEmbeddings read this env var directly
TEMP_DIR = "../temp"
# csv
def agent(filename: str):
    """Build a pandas-dataframe agent over the CSV at `filename`."""
    llm = ChatOpenAI(
        model="gpt-3.5-turbo-0125",
        # model="gpt-4",
        temperature=0.0,
        # max_tokens=256,
        # top_p=0.5,
    )
    df = pd.read_csv(filename, encoding="unicode_escape")
    pandas_df_agent = create_pandas_dataframe_agent(llm, df, verbose=True)
    return pandas_df_agent
def get_response(agent, query):
    """Run `query` through the dataframe agent, asking for a JSON-formatted reply."""
    prompt = (
        """
        For the following query, if it requires drawing a table, reply as follows:
        {"table": {"columns": ["column1", "column2", ...], "data": [[value1, value2, ...], [value1, value2, ...], ...]}}

        If the query requires creating a bar chart, reply as follows:
        {"bar": {"columns": ["A", "B", "C", ...], "data": [25, 24, 10, ...]}}

        If the query requires creating a line chart, reply as follows:
        {"line": {"columns": ["A", "B", "C", ...], "data": [25, 24, 10, ...]}}

        There can only be two types of charts, "bar" and "line".

        If it is just asking a question that requires neither, reply as follows:
        {"answer": "answer"}
        Example:
        {"answer": "The product with the highest sales is 'Classic Cars.'"}

        Include supporting numbers if there are any in the answer.
        Example:
        {"answer": "The product with the highest sales is 'Classic Cars' with 1111 sales."}

        If you do not know the answer, reply as follows:
        {"answer": "I do not know."}
        Do not hallucinate or make up data. If the data is not available, reply "I do not know."

        Return all output as a string in double quotes.
        All strings in the "columns" list and in the data list should be in double quotes.
        For example: {"columns": ["title", "ratings_count"], "data": [["Gilead", 361], ["Spider's Web", 5164]]}

        Let's think step by step.

        Below is the query.
        Query:
        """
        + query
    )
    response = agent.run(prompt)
    return str(response)
def return_response(response: str) -> dict:
    """Decode the agent's JSON reply; return None if it is not valid JSON."""
    try:
        return json.loads(response)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        return None
def write_response(response_dict: dict):
    """Render the decoded response as plain text or as a Gradio plot."""
    if response_dict is not None:
        if "answer" in response_dict:
            answer = response_dict["answer"]
            # st.write(answer)
            return answer
        if "bar" in response_dict:
            # data is {"columns": [...], "data": [...]}; build a two-column frame
            df = pd.DataFrame(response_dict["bar"])
            # st.bar_chart(df)
            return gr.BarPlot(df, x="columns", y="data")
        if "line" in response_dict:
            df = pd.DataFrame(response_dict["line"])
            # st.line_chart(df)
            return gr.LinePlot(df, x="columns", y="data")
        # if "table" in response_dict:
        #     data = response_dict["table"]
        #     df = pd.DataFrame(data["data"], columns=data["columns"])
        #     # st.table(df)
    else:
        answer = "Decoded response is None. Please retry with a better prompt."
        return answer
def ques_csv(data, question: str):
    """Answer `question` about the CSV file `data` and render the result."""
    csv_agent = agent(data)
    response = get_response(agent=csv_agent, query=question)
    decoded_response = return_response(response)
    answer = write_response(decoded_response)
    return answer
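# Example usage (a sketch: "sales.csv" and the question are hypothetical, and
# OPENAI_API_KEY must be set in the environment):
#
#     result = ques_csv("sales.csv", "Which product line has the highest total sales?")
#     print(result)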
# pdf
def ques_pdf(data, question: str):
    """Answer `question` about the PDF file `data` via a RAG chain."""
    doc = load_pdf(data)
    chunks = split_pdf(doc)
    retriever = store_retrieve(chunks)
    prompt = write_prompt()
    answer = ques_llm(retriever, prompt, question)
    # st.write(answer)
    return answer
def make_dir():
    os.makedirs(TEMP_DIR, exist_ok=True)
def upload(uploaded_file):
    if uploaded_file is not None:
        file_path = os.path.join(TEMP_DIR, uploaded_file.name)
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getvalue())
        return file_path
def load_pdf(filename: str):
    loader = PyPDFLoader(filename)
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    pages = loader.load_and_split(text_splitter=text_splitter)
    return pages
def split_pdf(doc):
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(doc)
    return chunks
def store_retrieve(chunks):
    """Embed `chunks` into an embedded (locally spawned) Weaviate instance and return a retriever."""
    client = weaviate.Client(
        embedded_options=EmbeddedOptions()
    )
    vectorstore = Weaviate.from_documents(
        client=client,
        documents=chunks,
        embedding=OpenAIEmbeddings(),
        by_text=False,
    )
    retriever = vectorstore.as_retriever()
    return retriever
def write_prompt():
    template = """You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Question: {question}
    Context: {context}
    Answer:
    """
    prompt = ChatPromptTemplate.from_template(template)
    return prompt
def ques_llm(retriever, prompt, question):
    """Answer `question` with a simple RAG chain: retrieve context, prompt the LLM, parse to string."""
    llm = ChatOpenAI(model_name="gpt-4", temperature=0)
    # llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q8_0.bin", temperature=0)
    # llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q4_0.bin", temperature=0)
    rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    ans = rag_chain.invoke(question)
    return ans
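if __name__ == "__main__":
    # Minimal smoke test for the PDF flow. This is a sketch: "sample.pdf" is a
    # hypothetical local file, and OPENAI_API_KEY must be set in .env for both
    # the chat model and the embeddings.
    print(ques_pdf("sample.pdf", "What is this document about?"))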