"""RAG demo script: index a PDF into a Chroma vector store and answer
questions about it through a Gradio UI backed by the OpenAI chat API."""
from PyPDF2 import PdfReader
# import pdfplumber
from tqdm import tqdm
import tiktoken
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
import openai
import streamlit as st
import gradio as gr
from gradio.components import Textbox, Slider
import os
# take as env variable called OPENAI_API_KEY
openai.api_key = os.getenv("OPENAI")
# write some python constants for file name, paragraph length, overlapping length:
file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"
paragraph_length = 100
overlapping_length = 50
db = None
from PyPDF2 import PdfReader
def load_pdf(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*.

    Uses PyPDF2's PdfReader; shows a tqdm progress bar over pages.
    """
    print("load pdf")
    reader = PdfReader(file_path)
    # join() instead of repeated `+=` (quadratic on large PDFs); `or ""`
    # guards against extract_text() yielding None for image-only pages.
    return "".join(page.extract_text() or "" for page in tqdm(reader.pages))
def extract_text_with_format(pdf_path):
    """Return the concatenated text of every page using pdfplumber.

    WARNING(review): the `import pdfplumber` line at the top of this file is
    commented out, so calling this function raises NameError as-is. It is
    currently unused — the call site in run() is also commented out.
    """
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        # Progress bar over pages; page.extract_text() may return None for
        # pages without extractable text — would raise TypeError here.
        for page in tqdm(pdf.pages):
            text += page.extract_text()
        return text
from collections import deque
def split_text(text, paragraph_length, overlapping_length):
    """Split *text* into overlapping chunks measured in GPT-4 tokens.

    Each chunk after the first covers tokens
    [i - overlapping_length, i + paragraph_length), i.e. it repeats the last
    `overlapping_length` tokens of the previous chunk for context continuity.

    Returns a list of decoded text chunks.
    """
    # The original also built a cl100k_base encoding and immediately
    # overwrote it — dead statement removed (encoding_for_model("gpt-4")
    # resolves to cl100k_base anyway).
    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(text)
    chunks = []
    i = 0
    while i < len(tokens):
        start = max(i - overlapping_length, 0)
        chunks.append(enc.decode(tokens[start:i + paragraph_length]))
        i += paragraph_length
    return chunks
def save_in_DB(splitted_text):
    """Embed the given text chunks and store them in an in-memory Chroma DB.

    Returns the Chroma instance so callers can run similarity searches.
    """
    # Open-source sentence-transformer embeddings (no API key required).
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    store = Chroma.from_texts(splitted_text, embedder)
    print("Data saved successfully!")
    print("type db", type(store))
    return store
def query(query_text, num_docs):
    """Answer *query_text* from the global Chroma store `db`.

    Retrieves the `num_docs` most similar chunks, stuffs them into a prompt,
    and asks gpt-3.5-turbo. Returns the generated answer (also echoed through
    the streamlit helpers, which are no-ops outside a streamlit run).
    """
    st.title('RAG system')
    docs = db.similarity_search(query_text, k=num_docs)
    print("len(docs)", len(docs))
    for doc in docs:
        print("doc", doc.page_content)
        print()
        print()
    # Use every retrieved document. The original joined only docs[:5],
    # silently ignoring the num_docs slider for values above 5.
    context = '\n\n'.join(doc.page_content for doc in docs)
    instruct = f"The following is a context from various documents:\n{context}\n\nQuestion: {query_text}\nAnswer:"
    # NOTE(review): legacy openai<1.0 API — migrate to
    # openai.OpenAI().chat.completions.create when upgrading the package.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": instruct}
        ],
        max_tokens=150
    )
    predicted = completion.choices[0].message["content"]
    st.subheader("Answer:")
    st.write(predicted)
    return predicted
def run():
    """Build the vector index from the source PDF, then launch the Gradio UI."""
    global db
    print("run app")
    raw_text = load_pdf(file_path)
    chunks = split_text(raw_text, paragraph_length, overlapping_length)
    print("num splitted text", len(chunks))
    db = save_in_DB(chunks)
    print("type db", type(db))
    # UI: free-text question plus a slider for how many chunks to retrieve.
    question_input = Textbox(lines=1, placeholder="Type your question here...", label="Question")
    docs_input = Slider(minimum=1, maximum=20, default=4, step=1, label="Number of Documents in Context")
    app = gr.Interface(
        fn=query,
        inputs=[question_input, docs_input],
        outputs="text",
        theme="dark"
    )
    app.launch()
# Entry point. The original ended with a stray `|` artifact after run();
# the __main__ guard also keeps the app from launching on a bare import.
if __name__ == "__main__":
    run()