from PyPDF2 import PdfReader
from tqdm import tqdm
import tiktoken
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
import openai
import gradio as gr
from gradio.components import Textbox, Slider
import os
# import streamlit as st  # only needed for the commented-out Streamlit prototype below

# Read the API key from the environment variable OPENAI_API_KEY.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Constants: source file, chunk length and overlap (both measured in tokens).
file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"
paragraph_length = 100
overlapping_length = 50

db = None


def load_pdf(file_path):
    """Extract plain text from every page of the PDF with PyPDF2."""
    print("load pdf")
    reader = PdfReader(file_path)
    # Concatenate all pages.
    text = ''
    for page in tqdm(reader.pages):
        # extract_text() can return None for pages without a text layer.
        text += page.extract_text() or ''
    return text


def extract_text_with_format(pdf_path):
    """Alternative extractor using pdfplumber (better layout handling)."""
    import pdfplumber  # imported lazily so the script runs without pdfplumber installed
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in tqdm(pdf.pages):
            text += page.extract_text() or ''
    return text


def split_text(text, paragraph_length, overlapping_length):
    """Split the text into token chunks.

    Each chunk holds the next `paragraph_length` tokens plus the last
    `overlapping_length` tokens of the previous chunk as leading context.
    """
    enc = tiktoken.encoding_for_model("gpt-4")  # resolves to the cl100k_base encoding

    def tokens_to_text(tokens):
        return enc.decode(tokens)

    splitted_text = []
    tokens = enc.encode(text)
    i = 0
    while i < len(tokens):
        start = max(i - overlapping_length, 0)
        end = i + paragraph_length
        splitted_text.append(tokens_to_text(tokens[start:end]))
        i += paragraph_length
    return splitted_text


def save_in_DB(splitted_text):
    # Create the open-source embedding function and index the chunks in Chroma.
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma.from_texts(splitted_text, embedding_function)
    print("Data saved successfully!")
    print("type db", type(db))
    return db


def query(query_text, num_docs):
    """Gradio callback: retrieve the most similar chunks and ask the model."""
    # Streamlit remnants from an earlier prototype (no-ops under Gradio):
    # st.title('RAG system')
    # query_text = st.text_input("Enter your question", "Cynthia W. Harris is a citizen of which state?", key="question")
    docs = db.similarity_search(query_text, k=num_docs)
    print("len(docs)", len(docs))
    # Print each doc's page_content, clearly separated.
    for doc in docs:
        print("doc", doc.page_content)
        print()
        print()
    # Use the first 5 results as context.
    context = '\n\n'.join([doc.page_content for doc in docs[:5]])
    # Show context in Streamlit with a subheader (prototype leftover):
    # st.subheader("Context:")
    # st.write(context)
    instruct = (
        f"The following is a context from various documents:\n{context}\n\n"
        f"Question: {query_text}\nAnswer:"
    )
    # Make an OpenAI request with the given context and query (pre-1.0 openai client API).
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # or any other chat model you're targeting
        messages=[
            {"role": "user", "content": instruct}
        ],
        max_tokens=150,
    )
    # Extract and return the generated answer.
    predicted = completion.choices[0].message["content"]
    # st.subheader("Answer:")
    # st.write(predicted)
    return predicted


def run():
    global db
    print("run app")
    text = load_pdf(file_path)
    # text = extract_text_with_format(file_path)  # pdfplumber variant
    splitted_text = split_text(text, paragraph_length, overlapping_length)
    print("num splitted text", len(splitted_text))
    db = save_in_DB(splitted_text)

    demo = gr.Interface(
        fn=query,
        inputs=[
            Textbox(lines=1, placeholder="Type your question here...", label="Question"),
            Slider(minimum=1, maximum=20, value=4, step=1,
                   label="Number of Documents in Context"),
        ],
        outputs="text",
        theme="dark",
    )
    demo.launch()


if __name__ == "__main__":
    run()
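

# --- Optional: programmatic smoke test -------------------------------------
# A minimal sketch (not part of the original app) showing how the same
# pipeline can be exercised without launching the Gradio UI. The question
# string is only an illustrative placeholder; any query relevant to the
# indexed PDF works. Uncomment and call smoke_test() instead of run().
#
# def smoke_test():
#     global db
#     text = load_pdf(file_path)
#     chunks = split_text(text, paragraph_length, overlapping_length)
#     db = save_in_DB(chunks)
#     print(query("What claims does the complaint raise?", num_docs=4))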