# psdocuments / app.py
# Tomas Larsson
# update
# 52cc340
import streamlit as st
# Reset the "em" status marker at the start of every script run; its value is
# appended to the "Waiting for system to wake up" message shown further down.
st.session_state.em = "0"
import os
import json
import requests
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
import re
import os
import numpy as np
# Page setup: must run before any other Streamlit element is rendered.
st.set_page_config(layout="wide")

# Path to the banner image shown at the top of the page.
image_path = 'fire.jpg'
st.image(image_path, caption='', use_column_width=True)

# Captured BEFORE the startup script runs, so this reflects the state left by
# a previous run of the script, not what start2.py sets up during this run.
started = 'docs' in st.session_state

# SECURITY NOTE: exec() runs start2.py with full access to this module's
# globals. That is only acceptable because the file ships with the app; never
# point this at user-supplied content. The source is read through a context
# manager so the file handle is closed deterministically.
with open('start2.py') as _startup_file:
    _startup_source = _startup_file.read()
exec(_startup_source)

# os.environ only accepts strings, so a missing 'openkey' variable used to
# surface as an opaque "str expected, not NoneType" TypeError. Halt with an
# explicit message instead.
_openai_key = os.getenv('openkey')
if _openai_key is None:
    st.error("The 'openkey' environment variable is not set; cannot configure OPENAI_API_KEY.")
    st.stop()
os.environ["OPENAI_API_KEY"] = _openai_key
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Relies on a ``fitz`` name being in scope. The visible part of this file
    does not import it -- presumably the exec'd startup script does; verify
    before calling.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The page texts joined in page order (empty string for a PDF with no
        pages).
    """
    doc = fitz.open(pdf_path)
    try:
        # Pages are loaded by index, in order; join avoids repeated string
        # re-allocation for long documents.
        return "".join(
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        )
    finally:
        # Close the document even when a page fails to load or extract.
        doc.close()
def extract_text_from_pdf2(PDFfile):
    """Return the text of every page of a PDF, extracted with PyPDF2.

    The previous implementation ignored its argument and always read the
    hard-coded file 'pc.pdf'; the supplied path is now honoured. The leftover
    debug print of the page list has been removed.

    Args:
        PDFfile: Path of the PDF file to read.

    Returns:
        The extracted page texts joined in page order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Imported locally, as in the original, so the module loads without PyPDF2.
    import PyPDF2

    # The reader pulls page data lazily from the stream, so extraction has to
    # finish before the file is closed; the context manager also closes the
    # handle if extraction raises.
    with open(PDFfile, 'rb') as pdf_stream:
        reader = PyPDF2.PdfReader(pdf_stream)
        # Guard against a page yielding no text object at all.
        return "".join(page.extract_text() or "" for page in reader.pages)
def strip_repeated_dots_and_blanks(text):
    """Collapse runs of dots and of spaces, and tidy space-only lines.

    The substitutions are applied in order:
      1. two or more consecutive dots become a single dot;
      2. two or more consecutive spaces become a single space;
      3. a newline, one space, newline sequence becomes two newlines.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'\.{2,}', '.'),
        (r' {2,}', ' '),
        ('\n \n', '\n\n'),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
# Title of the page
st.title('Peerstreet Question and Answer App')

# Free-text question plus an explicit submit button; the answer logic further
# down only runs when the button was pressed on this script run.
question = st.text_input("Type your question here:")
submit_button = st.button('Submit')

st.markdown("For best results keep questions simple and to the point and use words that are likely to be found in the documents")
st.markdown(""" Sample Questions:
* When is the voting deadline?
* What is the expected recovery for MPDN's?
""")

# Three tabs: the answer, the passages it was built from, and app information.
# (Fixed the misspelled label of the third tab.)
Answer_tab, Content_tab, Info_tab = st.tabs(["Answer", "Content used to create answer", "Information about this app"])

# Empty placeholders that the submit handling fills in later.
with Answer_tab:
    answer_placeholder = st.empty()
with Content_tab:
    content_placeholder = st.empty()
# NOTE(review): the original indentation was lost; the download button is
# assumed to belong inside the Info tab -- confirm against the deployed layout.
with Info_tab:
    st.markdown("""## Use at your own risk, accuracy of responses are not guaranteed.
This app base its answers on 110 documents filed by the court. This does not include any scanned documents at this point
as it takes more work to retrieve the text from them. It does include most orders filed by the court up to Feb 29th.
This is a simple RAG (retrieval augmented generation) system and does not consider order of events when
retrieving information and generating responses. It can also easily misinterpret information, but information used to generate the
response is presented in the content tab with link to the full document so that you can read the details in its proper context.
""" )

    # results.json only exists once a first answer has been saved. Treat a
    # missing file as an empty history (the same convention the save logic
    # uses) instead of crashing the whole page on a fresh deployment.
    try:
        with open('results.json', 'r') as file:
            content = file.read()
    except FileNotFoundError:
        content = "[]"
    data_to_download = content.encode()

    # Create a download button. "json" is not a valid MIME type; browsers
    # expect "application/json".
    st.download_button(label="Download Prior responses",
                       data=data_to_download,
                       file_name="results.json",
                       mime="application/json")
def _append_result(path, record):
    """Append *record* to the JSON list stored at *path*, creating it if needed."""
    try:
        with open(path, 'r') as handle:
            history = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        # A missing or empty/corrupt file starts a fresh history.
        history = []
    history.append(record)
    with open(path, 'w') as handle:
        json.dump(history, handle, indent=4)


def _format_sources(base_url, items, sources, titles, dates, chunks, similarities):
    """Build the markdown listing each passage used and a link to its source."""
    sections = []
    # zip() stops at the shortest list, so mismatched list lengths show fewer
    # passages rather than raising IndexError as the index loop did.
    for item, source, title, date, chunk, similarity in zip(
            items, sources, titles, dates, chunks, similarities):
        reference = (
            f" [{title}]({base_url}{source}) text block: {chunk}"
            f" Relevance: {similarity:.2f} Date:{date}"
        )
        sections.append(
            "### Paragraph used. \n" + escape_markdown(item)
            + "\n\n source:" + reference + "\n"
        )
    return "".join(sections)


# Logic to display an answer when the submit button is pressed
if submit_button:
    if not question:
        answer_placeholder.warning("Please type a question.")
    else:
        try:
            if started:
                # ask() and escape_markdown() are not defined in the visible
                # part of this file -- presumably provided by the exec'd
                # startup script.
                answer, selected_items, selected_sources, titles, dates, selected_chunks, highest_simularities = ask(question)
                answer_placeholder.markdown(escape_markdown(answer))  # Display the answer

                # Persist the exchange so it can be downloaded later.
                data_to_save = {
                    "query": question,
                    "answer": answer,
                    "selected_items": selected_items,
                    "selected_sources": selected_sources,
                    "selected_chunks": selected_chunks,
                    "highest_similarities": [f"{sim:.2f}" for sim in highest_simularities]
                }
                file_path = 'results.json'
                _append_result(file_path, data_to_save)

                # Show the passages the answer was based on, each linked to
                # its full document.
                url = 'https://cases.stretto.com/public/x247/12208/PLEADINGS/'
                string = _format_sources(
                    url, selected_items, selected_sources, titles, dates,
                    selected_chunks, highest_simularities,
                )
                content_placeholder.markdown(string)
            else:
                # "ln" is not set anywhere in the visible code; fall back to
                # an empty string so the waiting message is shown instead of
                # an AttributeError.
                progress = st.session_state.get("ln", "")
                answer_placeholder.markdown(
                    f"Waiting for system to wake up {progress} {st.session_state.em}"
                )
        except Exception as e:
            # Top-level boundary: show the failure to the user instead of
            # crashing the page. markdown() expects text, so pass the message.
            answer_placeholder.markdown(str(e))
#if 'retriever' not in st.session_state:
# st.session_state.em = "mm"
#if 'retriever' not in st.session_state:
# st.session_state.em = "1"
# exec(open('start.py').read())
# st.session_state.em = "2"