Spaces:

tlarsson
/

psdocuments

Sleeping

Tomas Larsson

update

52cc340 almost 2 years ago

6.72 kB


	import streamlit as st
	# Reset the "em" status marker on every script run; its value is appended to
	# the "Waiting for system to wake up" message shown further down.
	st.session_state.em = "0"

	import os
	import json


	import requests
	from langchain.document_loaders import TextLoader
	from langchain.text_splitter import CharacterTextSplitter
	import re

	import os
	import numpy as np

	st.set_page_config(layout="wide")


	# Banner image shown at the top of the page.
	image_path = 'fire.jpg'
	st.image(image_path, caption='', use_column_width=True)


	# True when 'docs' is already present in the session state. This is evaluated
	# BEFORE start2.py runs, so keep the two statements in this order.
	started = 'docs' in st.session_state

	# NOTE(review): exec() runs start2.py inside this script's namespace; the names
	# used later (ask, escape_markdown) presumably come from it - confirm. Only
	# ever exec a trusted, local file here.
	with open('start2.py') as startup_file:
	    startup_source = startup_file.read()
	exec(startup_source)


	# Copy the key from the 'openkey' environment variable to the name the OpenAI
	# client expects. Assigning None to os.environ raises an opaque TypeError, so
	# the missing-key case is handled explicitly.
	openai_key = os.getenv('openkey')
	if openai_key is not None:
	    os.environ["OPENAI_API_KEY"] = openai_key
	elif "OPENAI_API_KEY" not in os.environ:
	    raise RuntimeError(
	        "Environment variable 'openkey' is not set and OPENAI_API_KEY is not defined."
	    )

	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page of the PDF at *pdf_path*, concatenated.

	    Args:
	        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

	    Returns:
	        One string holding the text of all pages in page order.

	    NOTE(review): ``fitz`` is not imported in the visible part of this file;
	    presumably it is brought into scope by the exec'd start2.py - confirm.
	    """
	    doc = fitz.open(pdf_path)
	    try:
	        # Join once instead of growing a string page by page.
	        return "".join(
	            doc.load_page(page_num).get_text() for page_num in range(len(doc))
	        )
	    finally:
	        # Close the document even when reading a page fails.
	        doc.close()

	def extract_text_from_pdf2(PDFfile='pc.pdf'):
	    """Return the text of every page of a PDF, extracted with PyPDF2.

	    Args:
	        PDFfile: Path of the PDF to read, or an already-open binary file
	            object. Defaults to 'pc.pdf', the file the previous version
	            always read regardless of the argument.

	    Returns:
	        One string holding the extracted text of all pages in page order.
	    """
	    import PyPDF2

	    def _read_all(stream):
	        reader = PyPDF2.PdfReader(stream)
	        # Fall back to "" in case a page yields no text.
	        return "".join((page.extract_text() or "") for page in reader.pages)

	    # A caller-supplied stream is used as-is and left open for its owner.
	    if hasattr(PDFfile, "read"):
	        return _read_all(PDFfile)

	    # The context manager closes the file even when extraction fails.
	    with open(PDFfile, 'rb') as pdf_stream:
	        return _read_all(pdf_stream)


	def strip_repeated_dots_and_blanks(text):
	    """Collapse dot runs, space runs and space-only lines in *text*.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string: every run of two or more dots becomes one dot,
	        every run of two or more spaces becomes one space, and every line
	        that then holds only a single space becomes an empty line.
	    """
	    # Replace multiple dots with a single dot
	    text = re.sub(r'\.{2,}', '.', text)
	    # Replace multiple spaces with a single space
	    text = re.sub(r' {2,}', ' ', text)
	    # Drop the lone space on space-only lines. The lookahead leaves the
	    # closing newline unconsumed so that consecutive space-only lines are all
	    # cleaned; a plain '\n \n' pattern skips every second one.
	    text = re.sub(r'\n (?=\n)', '\n', text)

	    return text




	# Title of the page
	st.title('Peerstreet Question and Answer App')

	# Text input for the question
	question = st.text_input("Type your question here:")

	# A button to submit the question
	submit_button = st.button('Submit')

	# Usage hints shown under the input
	st.markdown("For best results keep questions simple and to the point and use words that are likely to be found in the documents")
	st.markdown(""" Sample Questions:

	* When is the voting deadline?
	* What is the expected recovery for MPDN's?

	""")

	# Create tabs
	Answer_tab, Content_tab, Info_tab = st.tabs(["Answer", "Content used to create answer", "Information about this app"])


	# Empty placeholders that are filled in once a question has been submitted
	with Answer_tab:
	    answer_placeholder = st.empty()
	with Content_tab:
	    content_placeholder = st.empty()
	# Static disclaimer and description of the app
	with Info_tab:
	    st.markdown("""## Use at your own risk, accuracy of responses are not guaranteed.

	This app bases its answers on 110 documents filed by the court. This does not include any scanned documents at this point
	as it takes more work to retrieve the text from them. It does include most orders filed by the court up to Feb 29th.


	This is a simple RAG (retrieval augmented generation) system and does not consider order of events when
	retrieving information and generating responses. It can also easily misinterpret information, but information used to generate the
	response is presented in the content tab with link to the full document so that you can read the details in its proper context.


	""" )



	# Offer the log of prior questions and answers as a download.
	# NOTE(review): the indentation of this file was lost, so it is unclear whether
	# this section originally sat inside the Info tab; it is written as top-level
	# code here - confirm.
	try:
	    with open('results.json', 'r') as file:
	        content = file.read()
	except FileNotFoundError:
	    # No answer has been saved yet: serve an empty JSON list instead of
	    # crashing the page.
	    content = "[]"


	data_to_download = content.encode()

	# Create a download button
	st.download_button(label="Download Prior responses",
	                   data=data_to_download,
	                   file_name="results.json",
	                   mime="application/json")



	def _append_result(file_path, record):
	    """Append *record* to the JSON list stored in *file_path*.

	    A missing or unparsable file is treated as an empty list. The updated list
	    is serialized before the file is opened for writing, so a record that
	    cannot be serialized does not truncate the existing log.
	    """
	    try:
	        with open(file_path, 'r') as file:
	            existing_data = json.load(file)
	    except (FileNotFoundError, json.JSONDecodeError):
	        existing_data = []

	    existing_data.append(record)
	    serialized = json.dumps(existing_data, indent=4)

	    with open(file_path, 'w') as file:
	        file.write(serialized)


	def _format_sources(items, sources, doc_titles, doc_dates, chunks, similarities, base_url):
	    """Return the markdown listing each passage used and a link to its source.

	    The sequences are walked in lockstep; iteration stops at the shortest one.
	    """
	    sections = []
	    for item, source, title, date, chunk, similarity in zip(
	            items, sources, doc_titles, doc_dates, chunks, similarities):
	        link = (f" [{title}]({base_url}{source}) text block: {chunk}"
	                f" Relevance: {similarity:.2f} Date:{date}")
	        sections.append("### Paragraph used. \n" + escape_markdown(item)
	                        + "\n\n source:" + link + "\n")
	    return "".join(sections)


	# Logic to display an answer when the submit button is pressed
	if submit_button:
	    if not question:
	        answer_placeholder.warning("Please type a question.")
	    elif not started:
	        # The status markers may not have been set yet; fall back to "".
	        wake_ln = getattr(st.session_state, "ln", "")
	        wake_em = getattr(st.session_state, "em", "")
	        answer_placeholder.markdown(f"Waiting for system to wake up {wake_ln} {wake_em}")
	    else:
	        try:
	            (answer, selected_items, selected_sources, titles, dates,
	             selected_chunks, highest_simularities) = ask(question)
	            answer_placeholder.markdown(escape_markdown(answer))  # Display the answer
	        except Exception as e:
	            # UI boundary: show whatever went wrong in place of the answer.
	            answer_placeholder.markdown(str(e))
	        else:
	            # From here on the answer is already on screen; failures below are
	            # reported separately so they do not overwrite it.

	            # The file to which the data will be appended
	            file_path = 'results.json'
	            try:
	                data_to_save = {
	                    "query": question,
	                    "answer": answer,
	                    "selected_items": selected_items,
	                    "selected_sources": selected_sources,
	                    "selected_chunks": selected_chunks,
	                    "highest_similarities": [f"{sim:.2f}" for sim in highest_simularities]
	                }
	                _append_result(file_path, data_to_save)
	            except (OSError, TypeError, ValueError) as e:
	                st.warning(f"Could not save this response: {e}")

	            # Base address that the source file names are appended to
	            url = 'https://cases.stretto.com/public/x247/12208/PLEADINGS/'
	            try:
	                content_placeholder.markdown(
	                    _format_sources(selected_items, selected_sources, titles, dates,
	                                    selected_chunks, highest_simularities, url))
	            except Exception as e:
	                content_placeholder.warning(f"Could not display the source passages: {e}")




	#if 'retriever' not in st.session_state:
	# st.session_state.em = "mm"

	#if 'retriever' not in st.session_state:
	# st.session_state.em = "1"
	# exec(open('start.py').read())
	# st.session_state.em = "2"