vr18 committed on
Commit
853a403
•
1 Parent(s): 93d6074

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -0
app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ # import pdfplumber
3
+ from tqdm import tqdm
4
+ import tiktoken
5
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
6
+ from langchain.vectorstores import Chroma
7
+ import openai
8
+ import streamlit as st
9
+ import gradio as gr
10
+
11
import os

# SECURITY: API keys must never be committed to source control. The key is
# read from the environment instead. The key that used to be hard-coded on
# this line has been exposed in the repository history and must be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Ingestion settings.
# Path of the PDF that is indexed at start-up.
file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"
# Number of tokens the chunking window advances per step.
paragraph_length = 100
# Number of preceding tokens prepended to every chunk after the first.
overlapping_length = 50
# Vector store handle; assigned by run() and read by query().
db = None
18
+
19
+ from PyPDF2 import PdfReader
20
+
21
+
22
def load_pdf(file_path):
    """Read a PDF and return the text of all its pages as a single string.

    Args:
        file_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The extracted text of every page, concatenated in page order with no
        separator. A page whose extraction yields nothing contributes an
        empty string.
    """
    print("load pdf")
    reader = PdfReader(file_path)
    # Join once instead of growing a string page by page, and tolerate an
    # extractor returning None for a page that has no text layer.
    return "".join(page.extract_text() or "" for page in tqdm(reader.pages))
30
+
31
+
32
def extract_text_with_format(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Alternative to ``load_pdf`` that uses pdfplumber as the extraction
    backend instead of PyPDF2.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The extracted text of every page, concatenated in page order with no
        separator. A page whose extraction yields nothing contributes an
        empty string.

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    # Imported lazily: the module-level import is commented out, so without
    # this the function would fail with NameError on its first line.
    import pdfplumber

    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` guards against extract_text() returning None for a page
        # without extractable text.
        return "".join(page.extract_text() or "" for page in tqdm(pdf.pages))
38
+
39
+
40
+ from collections import deque
41
+
42
+
43
def split_text(text, paragraph_length, overlapping_length):
    """Split text into overlapping chunks measured in GPT-4 tokens.

    The window advances ``paragraph_length`` tokens per step. The first chunk
    covers tokens ``[0, paragraph_length)``. Every later chunk additionally
    reaches back ``overlapping_length`` tokens, so it can hold up to
    ``paragraph_length + overlapping_length`` tokens.

    Args:
        text: The text to split.
        paragraph_length: Number of tokens the window advances per chunk.
            Must be positive.
        overlapping_length: Number of tokens before the window start that
            are prepended to each chunk after the first. Must not be
            negative.

    Returns:
        A list of decoded chunk strings in document order; empty when the
        text encodes to no tokens.

    Raises:
        ValueError: If ``paragraph_length`` is not positive (the window would
            never advance) or ``overlapping_length`` is negative (tokens
            would be silently skipped).
    """
    # Validate before touching the tokenizer so bad arguments fail fast.
    if paragraph_length <= 0:
        raise ValueError("paragraph_length must be a positive integer")
    if overlapping_length < 0:
        raise ValueError("overlapping_length must not be negative")

    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(text)

    # NOTE(review): a slice boundary may fall in the middle of a character
    # that spans several tokens; confirm how enc.decode renders that case.
    return [
        enc.decode(tokens[max(start - overlapping_length, 0):start + paragraph_length])
        for start in range(0, len(tokens), paragraph_length)
    ]
65
+
66
+
67
def save_in_DB(splitted_text):
    """Embed the text chunks and load them into a new Chroma vector store.

    Args:
        splitted_text: The text chunks to index.

    Returns:
        The Chroma store built from the chunks.
    """
    # Open-source sentence-transformers model used to embed each chunk.
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    store = Chroma.from_texts(splitted_text, embedder)
    print("Data saved successfully!")
    print("type db", type(store))
    return store
74
+
75
+
76
def query(query_text):
    """Answer a question using passages retrieved from the vector store.

    Retrieves the passages most similar to the question from the module-level
    ``db``, places them in a prompt together with the question, and asks the
    OpenAI chat model for an answer.

    Args:
        query_text: The user's question.

    Returns:
        A ``(answer, context)`` tuple: the model's answer and the retrieved
        passages that were supplied to it, joined by blank lines.

    Raises:
        RuntimeError: If the vector store has not been built yet.
    """
    # Fail with a clear message instead of an AttributeError on None.
    if db is None:
        raise RuntimeError("Vector store is not initialised; call run() first.")

    # NOTE(review): the Streamlit calls in this function are kept from the
    # original, but this callback is served through Gradio; confirm whether
    # they are still wanted.
    st.title('RAG system')

    docs = db.similarity_search(query_text)
    print("len(docs)", len(docs))

    # Use at most the first five retrieved passages as context.
    context = '\n\n'.join(doc.page_content for doc in docs[:5])
    instruct = f"The following is a context from various documents:\n{context}\n\nQuestion: {query_text}\nAnswer:"

    # Ask the chat model to answer the question given the retrieved context.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # or any other model you're targeting
        messages=[
            {"role": "user", "content": instruct}
        ],
        max_tokens=150
    )

    # Extract the generated answer from the first choice.
    predicted = completion.choices[0].message["content"]

    st.subheader("Answer:")
    st.write(predicted)
    return predicted, context
106
+
107
+
108
+
109
def run():
    """Index the configured PDF and serve the question-answering UI.

    Loads the PDF at ``file_path``, splits it into token chunks, stores the
    chunks in the module-level ``db``, then launches a Gradio interface
    whose callback is ``query``.
    """
    global db
    print("run app")
    # extract_text_with_format(file_path) is the alternative extractor.
    document_text = load_pdf(file_path)
    chunks = split_text(document_text, paragraph_length, overlapping_length)
    print("num splitted text", len(chunks))
    db = save_in_DB(chunks)
    print("type db", type(db))

    # query returns (answer, context), hence the two text outputs.
    demo = gr.Interface(fn=query, inputs="text", outputs=["text", "text"])
    demo.launch()
123
+
124
# Start the app only when this file is executed as a script, so that merely
# importing the module does not index the PDF and launch the web server.
if __name__ == "__main__":
    run()