vr18 committed on
Commit
967d0d2
•
1 Parent(s): d6568ea

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/Hair-Relaxer-Master-Complaint-1.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,2 @@
1
- ---
2
- title: Legal Rag
3
- emoji: 🌖
4
- colorFrom: green
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 3.47.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ pip install pypdf2 tiktoken langchain openai chromadb sentence-transformers streamlit
2
+ gradio
 
 
 
 
 
 
 
 
 
 
data/Hair-Relaxer-Master-Complaint-1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e0aa019b6d9bae3d3db63a158150bc5b4a45c749564ef7ddff77c909daf6be0
3
+ size 5619585
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pypdf2
2
+ tiktoken
3
+ langchain
4
+ openai
5
+ chromadb==0.3.29
6
+ sentence-transformers
7
+ streamlit
8
+ gradio
src/app.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ # import pdfplumber
3
+ from tqdm import tqdm
4
+ import tiktoken
5
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
6
+ from langchain.vectorstores import Chroma
7
+ import openai
8
+ import streamlit as st
9
+ import gradio as gr
10
+
11
import os

# SECURITY: never commit API keys to source control. The key is read from the
# OPENAI_API_KEY environment variable instead of being hard-coded here; any
# key that was previously published in this file should be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Module-level settings: source PDF, chunk stride and chunk overlap (both
# measured in tokens by split_text), and the vector store that run() fills in.
file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"
paragraph_length = 100
overlapping_length = 50
db = None
18
+
19
+ from PyPDF2 import PdfReader
20
+
21
+
22
def load_pdf(file_path):
    """Read a PDF and return the text of all its pages concatenated.

    Args:
        file_path: Location of the PDF, handed unchanged to ``PdfReader``.

    Returns:
        A single string with every page's extracted text in page order,
        with nothing inserted between pages.
    """
    print("load pdf")
    reader = PdfReader(file_path)
    # Join once instead of growing a string with ``+=`` per page. ``or ""``
    # keeps a page whose extraction yields nothing from breaking the join.
    return "".join(page.extract_text() or "" for page in tqdm(reader.pages))
30
+
31
+
32
def extract_text_with_format(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Alternative to ``load_pdf``. pdfplumber is imported lazily here because
    the module-level import is disabled; without this the function would fail
    with ``NameError`` on first use.

    Args:
        pdf_path: Location of the PDF, handed unchanged to ``pdfplumber.open``.

    Returns:
        A single string with every page's extracted text in page order.

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    import pdfplumber

    with pdfplumber.open(pdf_path) as pdf:
        # ``or ""`` tolerates a page for which extraction returns nothing
        # (presumably None on some pdfplumber versions -- verify).
        return "".join(page.extract_text() or "" for page in tqdm(pdf.pages))
38
+
39
+
40
+ from collections import deque
41
+
42
+
43
def split_text(text, paragraph_length, overlapping_length):
    """Split text into overlapping chunks measured in tokens.

    The text is tokenized with the tiktoken encoding for the "gpt-4" model.
    Chunk boundaries advance by ``paragraph_length`` tokens; every chunk after
    the first also reaches back ``overlapping_length`` tokens, so a chunk can
    hold up to ``paragraph_length + overlapping_length`` tokens.

    Args:
        text: The string to split.
        paragraph_length: Stride between chunk boundaries, in tokens. Must be
            positive.
        overlapping_length: Number of tokens each chunk repeats from the end
            of the previous one. Must not be negative.

    Returns:
        A list of decoded chunk strings, empty when the text yields no tokens.

    Raises:
        ValueError: If ``paragraph_length`` is not positive (the loop would
            never advance) or ``overlapping_length`` is negative (tokens
            would be silently dropped between chunks).
    """
    if paragraph_length <= 0:
        raise ValueError("paragraph_length must be a positive number of tokens")
    if overlapping_length < 0:
        raise ValueError("overlapping_length must not be negative")

    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(text)

    splitted_text = []
    for boundary in range(0, len(tokens), paragraph_length):
        # Reach back into the previous chunk, clamped at the start of the text.
        start = max(boundary - overlapping_length, 0)
        end = boundary + paragraph_length
        splitted_text.append(enc.decode(tokens[start:end]))

    return splitted_text
65
+
66
+
67
def save_in_DB(splitted_text):
    """Embed the text chunks and load them into a Chroma vector store.

    Args:
        splitted_text: The chunk strings to index, passed to
            ``Chroma.from_texts``.

    Returns:
        The object returned by ``Chroma.from_texts``.
    """
    # Open-source sentence-transformers model used to embed each chunk.
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    store = Chroma.from_texts(splitted_text, embedder)
    print("Data saved successfully!")
    print("type db", type(store))
    return store
74
+
75
+
76
def query(query_text):
    """Answer a question from the indexed document via retrieval plus a chat model.

    Looks up the chunks most similar to ``query_text`` in the module-level
    ``db``, joins up to five of them into a context string, and asks
    ``gpt-3.5-turbo`` to answer the question given that context.

    Args:
        query_text: The user's question.

    Returns:
        A ``(predicted, context)`` tuple: the model's answer text and the
        context string that was sent along with the question.

    Raises:
        RuntimeError: If the vector store has not been built yet, i.e. ``db``
            is still ``None`` because ``run()`` has not executed.
    """
    # Fail with an actionable message instead of an AttributeError on None.
    if db is None:
        raise RuntimeError("vector store is not initialised; call run() before query()")

    st.title('RAG system')

    docs = db.similarity_search(query_text)
    print("len(docs)", len(docs))

    # Use at most the first five matches as context for the model.
    context = '\n\n'.join(doc.page_content for doc in docs[:5])
    instruct = f"The following is a context from various documents:\n{context}\n\nQuestion: {query_text}\nAnswer:"

    # Single-turn chat request carrying the context and the question.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # or any other model you're targeting
        messages=[
            {"role": "user", "content": instruct}
        ],
        max_tokens=150
    )

    predicted = completion.choices[0].message["content"]

    # Streamlit output is kept alongside the return value used by the caller.
    st.subheader("Answer:")
    st.write(predicted)
    return predicted, context
106
+
107
+
108
+
109
def run():
    """Build the vector store from the configured PDF and launch the Gradio UI.

    Loads ``file_path``, splits its text using ``paragraph_length`` and
    ``overlapping_length``, stores the resulting index in the module-level
    ``db``, then serves ``query`` through a Gradio interface with one text
    input and two text outputs.
    """
    global db
    print("run app")
    chunks = split_text(load_pdf(file_path), paragraph_length, overlapping_length)
    print("num splitted text", len(chunks))
    db = save_in_DB(chunks)
    print("type db", type(db))

    gr.Interface(fn=query, inputs="text", outputs=["text", "text"]).launch()
122
+ # query(db)
src/main.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from app import run
2
+
3
def start_app():
    """Start the application by calling ``run`` imported from ``app``."""
    run()
5
+
6
+
7
# Start the app only when this file is executed as a script, not on import.
if __name__ == '__main__':
    start_app()
src/process.py ADDED
File without changes