Spaces:

vr18
/

legal-rag

Runtime error

App Files Files Community

vr18 committed on Oct 11, 2023

Commit

967d0d2

•

1 Parent(s): d6568ea

Upload 6 files

Browse files

Files changed (7) hide show

.gitattributes +1 -0
README.md +2 -12
data/Hair-Relaxer-Master-Complaint-1.pdf +3 -0
requirements.txt +8 -0
src/app.py +122 -0
src/main.py +9 -0
src/process.py +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/Hair-Relaxer-Master-Complaint-1.pdf filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,12 +1,2 @@
----
-title: Legal Rag
-emoji: 🌖
-colorFrom: green
-colorTo: purple
-sdk: gradio
-sdk_version: 3.47.1
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference


1	+ pip install pypdf2 tiktoken langchain openai chromadb sentence-transformers streamlit
2	+ gradio

data/Hair-Relaxer-Master-Complaint-1.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3e0aa019b6d9bae3d3db63a158150bc5b4a45c749564ef7ddff77c909daf6be0
+size 5619585

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+pypdf2
+tiktoken
+langchain
+openai
+chromadb==0.3.29
+sentence-transformers
+streamlit
+gradio

src/app.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import os
+from PyPDF2 import PdfReader
+# import pdfplumber
+from tqdm import tqdm
+import tiktoken
+from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+from langchain.vectorstores import Chroma
+import openai
+import streamlit as st
+import gradio as gr
+openai.api_key = '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'
+# write some python constants for file name, paragraph length, overlapping length:
+file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"
+paragraph_length = 100
+overlapping_length = 50
+db = None
+from PyPDF2 import PdfReader
+def load_pdf(file_path):
+    print("load pdf")
+    reader = PdfReader(file_path)
+    # concatenate all pages
+    text = ''
+    for page in tqdm(reader.pages):
+        text += page.extract_text()
+    return text
+def extract_text_with_format(pdf_path):
+    with pdfplumber.open(pdf_path) as pdf:
+        text = ''
+        for page in tqdm(pdf.pages):
+            text += page.extract_text()
+    return text
+from collections import deque
+def split_text(text, paragraph_length, overlapping_length):
+    enc = tiktoken.get_encoding("cl100k_base")
+    enc = tiktoken.encoding_for_model("gpt-4")
+    def get_len(tokens):
+        return len(tokens)
+    def tokens_to_text(tokens):
+        return enc.decode(tokens)
+    # split text so each item is max paragraph length and overlap is overlapping length
+    splitted_text = []
+    tokens = enc.encode(text)
+    i = 0
+    while i < len(tokens):
+        start = max(i - overlapping_length, 0)
+        end = i + paragraph_length
+        splitted_text.append(tokens_to_text(tokens[start:end]))
+        i += paragraph_length
+    return splitted_text
+def save_in_DB(splitted_text):
+    # Create the open-source embedding function
+    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+    db = Chroma.from_texts(splitted_text, embedding_function)
+    print("Data saved successfully!")
+    print("type db", type(db))
+    return db
+def query(query_text):
+    st.title('RAG system')
+    # query_text = st.text_input("Enter your question", "Cynthia W. Harris is a citizen of which state?", key="question")
+    docs = db.similarity_search(query_text)
+    print("len(docs)", len(docs))
+    # Store the first 10 results as context
+    context = '\n\n'.join([doc.page_content for doc in docs[:5]])
+    # show context in streamlit with subheader
+    """st.subheader("Context:")
+    st.write(context)"""
+    instruct = f"The following is a context from various documents:\n{context}\n\nQuestion: {query_text}\nAnswer:"
+    # Make an OpenAI request with the given context and query
+    completion = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",  # or any other model you're targeting
+        messages=[
+            {"role": "user", "content": instruct}
+            ],
+        max_tokens=150
+        )
+    # Extract the generated answer
+    predicted = completion.choices[0].message["content"]
+    # Return the generated answer
+    st.subheader("Answer:")
+    st.write(predicted)
+    return predicted, context
+def run():
+    global db
+    print("run app")
+    text = load_pdf(file_path)
+    # text = extract_text_with_format(file_path)
+    splitted_text = split_text(text, paragraph_length, overlapping_length)
+    print("num splitted text", len(splitted_text))
+    db = save_in_DB(splitted_text)
+    print("type db", type(db))
+    demo = gr.Interface(fn=query, inputs="text", outputs=["text", "text"])
+    demo.launch()
+    # query(db)

src/main.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from app import run
+def start_app():
+    run()
+# Press the green button in the gutter to run the script.
+if __name__ == '__main__':
+    start_app()

src/process.py ADDED Viewed

File without changes