from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chains import VectorDBQA
from langchain import PromptTemplate
from langchain_openai import OpenAI
from time import time
import pandas as pd
import numpy as np
import getpass
import re
import os
import gradio as gr

#from google.colab import drive
#drive.mount('/content/drive')

# Load the prepared Petco dataset from the working directory into a DataFrame.
# NOTE(review): final_df is not referenced again in the visible part of this
# script — confirm whether it is still needed.
final_df = pd.read_csv(filepath_or_buffer='petco_rag_df.csv')

# merged_final_df.to_csv('/content/drive/Shareddrives/RAG_SYS_PETCO/data/petco_rag_img_df.csv', index=False)

"""## Vectorstore Setup Chroma

#### Initial setup

You don't need to run this cell; it only builds and persists the Chroma database (one-time setup).
"""

# loader = CSVLoader("/content/drive/Shareddrives/RAG_SYS_PETCO/data/petco_rag_img_df.csv")
# documents = loader.load()
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=400)
# texts = text_splitter.split_documents(documents)
# persist_directory = '/content/drive/Shareddrives/RAG_SYS_PETCO/data/data/chromadb'
# embedding = OpenAIEmbeddings(model='text-embedding-3-large')
# vectordb = Chroma.from_documents(documents=texts, embedding=embedding, persist_directory=persist_directory)
# vectordb.persist()

"""#### Load predefined chroma"""

# Reopen the Chroma store from its persist directory rather than rebuilding it.
# The embedding model matches the one named in the commented-out setup code.
persist_directory = '/chromadb'
embedding = OpenAIEmbeddings(model='text-embedding-3-large')
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

# Baseline retriever: similarity search with k=50.
retriever = vectordb.as_retriever(
    search_kwargs={"k": 50},
    search_type="similarity",
)

"""## MultiQueryRetriever
"""

from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

# Chat model shared by the multi-query retriever and the downstream chain;
# temperature is set to 0.
llm = ChatOpenAI(temperature=0, model_name="gpt-4-turbo")

# Base retriever over the vector store using MMR search with k=50.
_mmr_retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 50},
)

# Wrap the base retriever with MultiQueryRetriever, handing it the LLM.
retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=_mmr_retriever,
)

import logging

# Install the default root handler, then raise the multi-query retriever's
# logger to INFO so its messages are emitted.
logging.basicConfig()
_multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
_multi_query_logger.setLevel(logging.INFO)

# Example query (kept for reference):
# question = """
# I have promotions for May 2024 for cats and dogs. The promotion is the customers get 30% discount on the select items for cat and dog.
# Can you generated an email subject and body where it would result in high CTR from customers?
# """
# docs = retriever_from_llm.get_relevant_documents(query=question)

from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Prompt that receives the retrieved documents as {context} and the user's
# request as {query}. input_variables must name exactly the placeholders used
# in the template, i.e. 'context' (the key call_rag passes), not 'contexts'.
qa_prompt = PromptTemplate(
    input_variables=['query', 'context'],
    template="""
    You are a recommendation system that analyze the user's interest and generate an email subject and body for PETCO.
    If the question cannot be answered using the information provided answer with 'I don't know'.
    Context: {context}
    Question: {query},
    """,
)

# Chain that fills qa_prompt and sends it to the shared chat model.
qa_chain = LLMChain(llm=llm, prompt=qa_prompt)

# out = qa_chain.invoke(
#     input={
#         "query": question,
#         "context": "\n---\n".join([d.page_content for d in docs])
#     }
# )
# print(out["text"])

# question = """
# I have promotion for cat and dog. Can you generate a description that can be used for image generation?
# """
# docs = retriever_from_llm.get_relevant_documents(query=question)
# out = qa_chain.invoke(
#     input={
#         "query": question,
#         "context": "\n---\n".join([d.page_content for d in docs])
#     }
# )
# print(out["text"])


def call_rag(question: str) -> str:
    """Answer ``question`` with the QA chain, grounded on retrieved documents.

    Args:
        question: Free-text prompt entered by the user.

    Returns:
        The ``"text"`` entry of the chain's output.
    """
    docs = retriever_from_llm.get_relevant_documents(query=question)
    # Retrieved documents are concatenated with a visible separator so the
    # prompt's Context section keeps them apart.
    context = "\n---\n".join(d.page_content for d in docs)
    out = qa_chain.invoke(input={"query": question, "context": context})
    return out["text"]


"""# Planning for the Final Presentation [April 26]

* Additional data preprocessing
  * Include image information (GPT generated description of the images in the emails)
  * Adding customer data such as purchase history or pet info(if available)
* Add image generation pipeline
  * Feed prompt generated from the retriever into Midjourney to generate images
  * Use actual images from Petco to evaluate the quality of the generated images
* Experiments
  * Prompt engineering with few shot examples
  * Create question sets to evaluate the robustness of the retrieval model
  * Explore hyperparameters to adjust quality of the generation
* End-to-end pipeline (if possible)
  * Use **sendgrid** API to automatically send the email from generated subject / body / images from pipeline
"""

# Minimal Gradio UI: one text box in, generated email text out.
interface = gr.Interface(
    fn=call_rag,
    inputs="text",
    outputs="text",
    title="Email Generation",
    description="Enter a prompt and get an example email generated.",
)

interface.launch()