Spaces:

vr18
/

legal-rag

Runtime error

App Files Files Community

vr18 committed on Oct 11, 2023

Commit

e9f0b55

1 Parent(s): 853a403

Delete src/app.py

Browse files

Files changed (1) hide show

src/app.py +0 -122

src/app.py DELETED Viewed

@@ -1,122 +0,0 @@
-from PyPDF2 import PdfReader
-# import pdfplumber
-from tqdm import tqdm
-import tiktoken
-from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
-from langchain.vectorstores import Chroma
-import openai
-import streamlit as st
-import gradio as gr
import os

# SECURITY: never commit API keys to source control. The key that used to be
# hard-coded on this line has been published and must be treated as
# compromised (revoke it). The key is now read from the OPENAI_API_KEY
# environment variable at import time; it is None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Ingestion settings read by run():
# path of the PDF to index
file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"
# distance, in tokens, between the starts of consecutive chunks
paragraph_length = 100
# number of preceding tokens prepended to every chunk after the first
overlapping_length = 50
# vector store handle; stays None until run() assigns the result of save_in_DB()
db = None
-from PyPDF2 import PdfReader
def load_pdf(file_path):
    """Read a PDF with PyPDF2 and return the text of all pages as one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text of every page, concatenated in page order with
        no separator between pages.
    """
    print("load pdf")
    pdf = PdfReader(file_path)
    # tqdm only wraps the page iterable to display a progress bar
    page_texts = [page.extract_text() for page in tqdm(pdf.pages)]
    return ''.join(page_texts)
def extract_text_with_format(pdf_path):
    """Return the text of every page of a PDF, extracted with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The page texts concatenated in page order, with no separator.

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    # The module-level pdfplumber import is commented out, so the name is
    # imported here; without this the function fails with NameError.
    import pdfplumber

    with pdfplumber.open(pdf_path) as pdf:
        # Guard against extract_text() yielding None for a page without
        # extractable text, which would break string concatenation.
        return ''.join(page.extract_text() or '' for page in tqdm(pdf.pages))
-from collections import deque
def split_text(text, paragraph_length, overlapping_length):
    """Split text into overlapping chunks measured in GPT-4 tokens.

    Chunk starts advance by ``paragraph_length`` tokens. Every chunk after
    the first is additionally prefixed with the ``overlapping_length`` tokens
    that precede its start, so the first chunk holds up to
    ``paragraph_length`` tokens and later chunks hold up to
    ``paragraph_length + overlapping_length`` tokens.

    Args:
        text: The text to split.
        paragraph_length: Positive number of tokens between chunk starts.
        overlapping_length: Non-negative number of preceding tokens repeated
            at the beginning of each chunk after the first.

    Returns:
        A list of chunk strings decoded from the token slices; an empty list
        when ``text`` is the empty string.

    Raises:
        ValueError: If ``paragraph_length`` is not positive (the scan would
            never advance) or ``overlapping_length`` is negative (tokens
            would be skipped between chunks).
    """
    if paragraph_length <= 0:
        raise ValueError("paragraph_length must be a positive number of tokens")
    if overlapping_length < 0:
        raise ValueError("overlapping_length must not be negative")
    if text == "":
        return []

    # Only the tokenizer resolved for "gpt-4" was ever used; the earlier
    # get_encoding("cl100k_base") assignment was immediately overwritten.
    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(text)
    return [
        enc.decode(tokens[max(start - overlapping_length, 0):start + paragraph_length])
        for start in range(0, len(tokens), paragraph_length)
    ]
def save_in_DB(splitted_text):
    """Embed the given texts and load them into a Chroma vector store.

    Args:
        splitted_text: The texts to index, passed to ``Chroma.from_texts``.

    Returns:
        The store object returned by ``Chroma.from_texts``.
    """
    # Open-source sentence-transformers model used to embed each text
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    store = Chroma.from_texts(splitted_text, embedder)
    print("Data saved successfully!")
    print("type db", type(store))
    return store
def query(query_text):
    """Answer a question using chunks retrieved from the vector store.

    Runs a similarity search for ``query_text`` against the module-level
    ``db``, joins up to five retrieved chunks into a context, and asks
    ``gpt-3.5-turbo`` to answer the question given that context.

    Args:
        query_text: The question to answer.

    Returns:
        A ``(predicted, context)`` tuple: the model's answer text and the
        context string that was sent with the question.

    Raises:
        RuntimeError: If the vector store has not been built yet.
    """
    # Fail with a clear message instead of an AttributeError on None when
    # the store has not been assigned by run().
    if db is None:
        raise RuntimeError("Vector store is not initialised; call run() before query().")

    # NOTE(review): Streamlit calls inside a function that run() registers as
    # a Gradio callback — confirm they are still wanted.
    st.title('RAG system')
    docs = db.similarity_search(query_text)
    print("len(docs)", len(docs))
    # Use at most the first 5 results as context
    context = '\n\n'.join(doc.page_content for doc in docs[:5])
    instruct = f"The following is a context from various documents:\n{context}\n\nQuestion: {query_text}\nAnswer:"
    # Make an OpenAI request with the given context and query
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # or any other model you're targeting
        messages=[
            {"role": "user", "content": instruct},
        ],
        max_tokens=150,
    )
    # Extract the generated answer
    predicted = completion.choices[0].message["content"]
    st.subheader("Answer:")
    st.write(predicted)
    return predicted, context
def run():
    """Index the configured PDF and launch the Gradio question-answering UI.

    Loads ``file_path``, splits its text using ``paragraph_length`` and
    ``overlapping_length``, stores the chunks in the module-level ``db``,
    and then serves ``query`` through a Gradio interface with one text
    input and two text outputs.
    """
    global db
    print("run app")
    chunks = split_text(load_pdf(file_path), paragraph_length, overlapping_length)
    print("num splitted text", len(chunks))
    db = save_in_DB(chunks)
    print("type db", type(db))
    gr.Interface(fn=query, inputs="text", outputs=["text", "text"]).launch()