import os

import backoff
import gradio as gr
import openai
import pinecone
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
openai.api_key = os.environ["OPENAI_API_KEY"]
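# Connect to Pinecone and empty the index, so each run re-indexes the policy
# from scratch instead of stacking duplicate vectors on top of old ones.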
pinecone.init(api_key=os.environ['PINECONE_API_KEY'],
              environment='us-east1-gcp')

index = pinecone.Index('asesura')
delete_response = index.delete(delete_all=True)
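# Load the policy PDF and split it into one document per page, then strip the
# regulatory header that repeats on every page.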
loader = PyPDFLoader("SaludClasico2023.pdf")
pages = loader.load_and_split()

n_pages = [page.page_content.replace("CÓDIGO CLAUSULADO - 01/01/2023 - 1411 - P - 35 - F-14-11-0090-214- D00I. CÓDIGO NOTA TÉCNICA – 01/01/2023 - 1411 - NT-P - 35 - N-14-11-0090-031", "") for page in pages]
print(len(n_pages))
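# Pages shorter than 100 characters carry too little context to stand alone:
# fold each one into the page that follows it, then drop the emptied slots.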
for i in range(len(n_pages) - 1):
    if len(n_pages[i]) < 100:
        n_pages[i + 1] = n_pages[i] + n_pages[i + 1]
        n_pages[i] = ""

n_pages = [page for page in n_pages if len(page) > 100]
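# Summarize every page with GPT-4 before indexing. The system prompt keeps
# each summary in the document's own language (Spanish), and each summary is
# then wrapped in a Document that carries its page index as metadata.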
pages_summarized = []

for i, chunk in enumerate(n_pages):
    messages = [
        {"role": "system", "content": "Summarize this text in the same language as the user's input."},
        {"role": "user", "content": chunk},
    ]

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        temperature=0.2,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    pages_summarized.append(
        response["choices"][0]["message"]["content"].strip())

pages = [Document(page_content=page_summarized, metadata={"page": i})
         for i, page_summarized in enumerate(pages_summarized)]
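# Chop the summaries into ~1000-character chunks with 100 characters of
# overlap, and prepare OpenAI embeddings for indexing.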
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
documents = text_splitter.split_documents(pages)

embeddings = OpenAIEmbeddings()
# Pinecone is already initialized above with the key from the environment,
# so the chunks can be embedded and upserted into the index directly.
index_name = "asesura"

docsearch = Pinecone.from_documents(
    documents, embeddings, index_name=index_name)
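# Build the retrieval QA chain: a deterministic (temperature=0) chat model
# over the Pinecone retriever, returning source chunks alongside each answer.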
qa = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0), docsearch.as_retriever(),
    return_source_documents=True)
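# A direct smoke test of the chain might look like this (hypothetical query,
# left commented out):
#   result = qa({"question": "¿Qué cubre la póliza?", "chat_history": []})
#   print(result["answer"])

# predict() makes two model calls: GPT-4 first normalizes the user's wording,
# then the chain answers over the indexed summaries. chat_history is passed
# empty, so every question is answered independently. The backoff decorators
# retry transient rate-limit and connection errors with exponential delay.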
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
@backoff.on_exception(backoff.expo, openai.error.APIConnectionError)
def predict(query):
    # First pass: have GPT-4 rewrite the raw question with correct
    # punctuation, grammar, and spelling before retrieval.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "system", "content": "Change the user's question so it is correctly made, with the correct punctuation, grammar, and spelling. The question is being made to an insurance policy called 'Sura Seguro de Salud Clásico' by a company named Sura. Return the question in the same language as the user."},
                  {"role": "user", "content": query}],
        temperature=0.1)
    refined_q = response.choices[0].message.content
    print(refined_q)

    # Second pass: answer the refined question against the indexed policy.
    response = qa(
        {"question": refined_q, "chat_history": []})['answer']
    return response
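# Gradio front end: one question box in, one answer box out.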
pregunta = gr.Textbox(label="Pregunta", placeholder="Escribe tu pregunta")
respuesta = gr.Textbox(label="Respuesta")

gr.Interface(predict, pregunta, respuesta, title="Asesura",
             description="Bienvenido a tu asesor personal de seguros Sura. Pregunta lo que necesites saber sobre el Seguro de Salud Clásico de Sura 2023").launch()