vr18 committed on
Commit
e9f0b55
•
1 Parent(s): 853a403

Delete src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +0 -122
src/app.py DELETED
@@ -1,122 +0,0 @@
1
- from PyPDF2 import PdfReader
2
- # import pdfplumber
3
- from tqdm import tqdm
4
- import tiktoken
5
- from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
6
- from langchain.vectorstores import Chroma
7
- import openai
8
- import streamlit as st
9
- import gradio as gr
10
-
11
import os

# Read the OpenAI credential from the environment instead of committing it to
# source control. The key that used to be hard-coded here has been exposed in
# the repository history and should be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Source document and chunking parameters.
file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"
paragraph_length = 100  # tokens advanced per chunk
overlapping_length = 50  # tokens of preceding text prepended to each chunk
db = None  # vector store; assigned by run() and read by query()
18
-
19
- from PyPDF2 import PdfReader
20
-
21
-
22
def load_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated.

    Pages are read in document order through ``PdfReader`` and a tqdm progress
    bar tracks the extraction.
    """
    print("load pdf")
    pages = PdfReader(file_path).pages
    # Join once at the end rather than growing a string page by page.
    return "".join(page.extract_text() for page in tqdm(pages))
30
-
31
-
32
def extract_text_with_format(pdf_path):
    """Return the concatenated text of every page, extracted with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A single string holding the text of all pages in document order.

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    # Imported locally because the module-level pdfplumber import is commented
    # out, which made every call to this function fail with NameError.
    import pdfplumber

    with pdfplumber.open(pdf_path) as pdf:
        # NOTE(review): extract_text() can yield None for a page without text
        # on some pdfplumber versions; fall back to "" so joining never fails.
        return "".join((page.extract_text() or "") for page in tqdm(pdf.pages))
38
-
39
-
40
- from collections import deque
41
-
42
-
43
def split_text(text, paragraph_length, overlapping_length):
    """Split ``text`` into overlapping chunks measured in GPT-4 tokens.

    The token stream is walked in strides of ``paragraph_length``. Each chunk
    holds the tokens of its stride preceded by up to ``overlapping_length``
    tokens from the end of the previous stride, so every chunk after the first
    can contain up to ``paragraph_length + overlapping_length`` tokens.

    Args:
        text: The text to split.
        paragraph_length: Number of tokens advanced per chunk; must be > 0.
        overlapping_length: Number of preceding tokens repeated at the start
            of each chunk; must be >= 0.

    Returns:
        The decoded chunks in document order (an empty list when ``text``
        encodes to no tokens).

    Raises:
        ValueError: If ``paragraph_length`` is not positive or
            ``overlapping_length`` is negative.
    """
    # Validate before touching the tokenizer: a non-positive stride would
    # never advance through the tokens, and a negative overlap would silently
    # skip tokens at the start of every chunk.
    if paragraph_length <= 0:
        raise ValueError("paragraph_length must be a positive number of tokens")
    if overlapping_length < 0:
        raise ValueError("overlapping_length must not be negative")

    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(text)

    return [
        enc.decode(tokens[max(i - overlapping_length, 0):i + paragraph_length])
        for i in range(0, len(tokens), paragraph_length)
    ]
65
-
66
-
67
def save_in_DB(splitted_text):
    """Embed the text chunks and load them into a new Chroma vector store.

    Embeddings are produced by the ``all-MiniLM-L6-v2`` sentence-transformer
    model. Returns the Chroma store built from ``splitted_text``.
    """
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    store = Chroma.from_texts(splitted_text, embedder)
    print("Data saved successfully!")
    print("type db", type(store))
    return store
74
-
75
-
76
def query(query_text):
    """Answer ``query_text`` with gpt-3.5-turbo, using retrieved chunks as context.

    The module-level vector store ``db`` is searched for chunks similar to the
    question; the retrieved chunks are joined into a context that is sent to
    the chat model together with the question.

    Args:
        query_text: The user's question.

    Returns:
        A ``(predicted, context)`` tuple: the model's answer and the context
        text that was supplied to it.

    Raises:
        RuntimeError: If the vector store has not been built yet.
    """
    # Fail with an explicit message instead of an AttributeError on None.
    if db is None:
        raise RuntimeError(
            "Vector store is not initialised; call run() before query()."
        )

    st.title('RAG system')

    docs = db.similarity_search(query_text)
    print("len(docs)", len(docs))

    # Use at most the first five retrieved chunks as context.
    context = '\n\n'.join(doc.page_content for doc in docs[:5])
    instruct = f"The following is a context from various documents:\n{context}\n\nQuestion: {query_text}\nAnswer:"

    # Ask the chat model to answer the question from the retrieved context.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # or any other model you're targeting
        messages=[
            {"role": "user", "content": instruct}
        ],
        max_tokens=150
    )

    # Extract the generated answer from the first choice.
    predicted = completion.choices[0].message["content"]

    st.subheader("Answer:")
    st.write(predicted)
    return predicted, context
106
-
107
-
108
-
109
def run():
    """Build the vector store from the configured PDF and launch the Gradio UI.

    Loads the PDF at ``file_path``, splits it into chunks, stores the chunks
    in the module-level ``db``, then serves ``query`` through a Gradio
    interface with one text input and two text outputs.
    """
    global db
    print("run app")
    chunks = split_text(load_pdf(file_path), paragraph_length, overlapping_length)
    print("num splitted text", len(chunks))
    db = save_in_DB(chunks)
    print("type db", type(db))

    gr.Interface(fn=query, inputs="text", outputs=["text", "text"]).launch()
122
- # query(db)