from PyPDF2 import PdfReader
import pdfplumber
from tqdm import tqdm
import tiktoken
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
import openai
import streamlit as st
import gradio as gr
from gradio.components import Textbox, Slider
import os

# Read the API key from the OPENAI_API_KEY environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")

# Constants: source PDF and chunk sizing (both lengths are in tokens)
file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"
paragraph_length = 100
overlapping_length = 50
db = None


def load_pdf(file_path):
    print("load pdf")
    reader = PdfReader(file_path)
    # Concatenate the text of all pages; guard against extract_text()
    # yielding None for pages with no extractable text.
    text = ''
    for page in tqdm(reader.pages):
        text += page.extract_text() or ''
    return text
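
# Illustrative usage of load_pdf (the path below is a placeholder, not a
# file shipped with this repo):
#
#     text = load_pdf("data/sample.pdf")
#     print(text[:200])  # first 200 characters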


def extract_text_with_format(pdf_path):
    # Alternative extractor; pdfplumber usually preserves reading order
    # and layout better than PyPDF2. extract_text() may return None.
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in tqdm(pdf.pages):
            text += page.extract_text() or ''
    return text
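
# pdfplumber can also pull tables out of a page, which plain text
# extraction flattens. A minimal sketch (illustrative, not called by run()):
#
#     with pdfplumber.open(file_path) as pdf:
#         tables = pdf.pages[0].extract_tables()
#         print(len(tables), "tables on page 1")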



def split_text(text, paragraph_length, overlapping_length):
    # cl100k_base, the encoding used by gpt-4 / gpt-3.5-turbo
    enc = tiktoken.encoding_for_model("gpt-4")

    # Split into chunks of paragraph_length tokens. Each chunk after the
    # first is prefixed with the last overlapping_length tokens of the
    # previous chunk, so a chunk holds at most
    # paragraph_length + overlapping_length tokens.
    splitted_text = []
    tokens = enc.encode(text)

    i = 0
    while i < len(tokens):
        start = max(i - overlapping_length, 0)
        end = i + paragraph_length
        splitted_text.append(enc.decode(tokens[start:end]))
        i += paragraph_length

    return splitted_text
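
# Quick sanity check (illustrative): with tiny sizes the overlap is easy to
# see, because consecutive chunks repeat the boundary tokens.
#
#     for chunk in split_text("one two three four five six seven eight", 5, 2):
#         print(repr(chunk))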


def save_in_DB(splitted_text):
    # Create the open-source embedding function
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma.from_texts(splitted_text, embedding_function)
    print("Data saved successfully!")
    print("type db", type(db))
    return db
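
# Note: Chroma.from_texts() as used above builds an in-memory store that is
# rebuilt on every start. A sketch of on-disk persistence (assuming a
# langchain version whose Chroma accepts persist_directory; older releases
# also need an explicit db.persist() call afterwards):
#
#     db = Chroma.from_texts(splitted_text, embedding_function,
#                            persist_directory="chroma_db")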


def query(query_text, num_docs):
    st.title('RAG system')

    # query_text = st.text_input("Enter your question", "Cynthia W. Harris is a citizen of which state?", key="question")
    docs = db.similarity_search(query_text, k=num_docs)
    print("len(docs)", len(docs))
    # print each doc's page_content, clearly separated
    for doc in docs:
        print("doc", doc.page_content)
        print()
        print()

    # Join all retrieved documents into the prompt context (num_docs
    # controls how many were retrieved above, so no extra cap here)
    context = '\n\n'.join(doc.page_content for doc in docs)
    # (the Streamlit version also displayed the context:)
    # st.subheader("Context:")
    # st.write(context)
    instruct = f"The following is a context from various documents:\n{context}\n\nQuestion: {query_text}\nAnswer:"

    # Make an OpenAI request with the given context and query
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # or any other model you're targeting
        messages=[
            {"role": "user", "content": instruct}
            ],
        max_tokens=150
        )

    # Extract the generated answer
    predicted = completion.choices[0].message["content"]

    # Return the generated answer
    st.subheader("Answer:")
    st.write(predicted)
    return predicted
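
# Example call outside the Gradio UI (illustrative; requires run() below to
# have populated the module-global `db` first):
#
#     answer = query("Cynthia W. Harris is a citizen of which state?", 5)
#     print(answer)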



def run():
    global db
    print("run app")
    text = load_pdf(file_path)
    # text = extract_text_with_format(file_path)
    splitted_text = split_text(text, paragraph_length, overlapping_length)
    print("num splitted text", len(splitted_text))
    db = save_in_DB(splitted_text)
    print("type db", type(db))

    demo = gr.Interface(
        fn=query,
        inputs=[
            Textbox(lines=1, placeholder="Type your question here...", label="Question"),
            # gradio 3.x names the initial setting `value` (2.x called it `default`)
            Slider(minimum=1, maximum=20, value=4, step=1, label="Number of Documents in Context")
        ],
        outputs="text",
        theme="dark"
    )

    demo.launch()
    # query(db)

run()