|
import streamlit as st |
|
import pandas as pd |
|
from PyPDF2 import PdfReader |
|
from difflib import ndiff |
|
import os |
|
import matplotlib.pyplot as plt |
|
from doctr.io import DocumentFile |
|
from doctr.models import ocr_predictor |
|
from sentence_transformers import SentenceTransformer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import fitz |
|
|
|
|
|
def find_similar(vector_representation, all_representations, threshold=0.5):
    """Return indices of rows in *all_representations* whose cosine
    similarity to *vector_representation* is strictly above *threshold*.

    Parameters
    ----------
    vector_representation : array-like of shape (1, dim)
        The query embedding (a single row).
    all_representations : array-like of shape (n, dim)
        Candidate embeddings to compare against.
    threshold : float, optional
        Strict lower bound on similarity for a row to be reported.

    Returns
    -------
    list[int]
        Positional indices of the similar rows.
    """
    # cosine_similarity returns a (1, n) matrix; keep the single row.
    similarities = cosine_similarity(vector_representation, all_representations)[0]

    # BUG FIX: the original used np.where without importing numpy, which
    # raised NameError at runtime. A plain comprehension needs no extra
    # dependency and yields the same list of indices.
    return [idx for idx, score in enumerate(similarities) if score > threshold]
|
|
|
def process_pdf(doc, diff_line):
    """Extract text lines from a PyMuPDF document and merge vertically
    adjacent lines into paragraph-like groups.

    Parameters
    ----------
    doc : fitz.Document
        An open PyMuPDF document (anything exposing ``pages()`` whose
        pages implement ``get_text("dict")``).
    diff_line : float
        Maximum allowed gap between the bottom of one line (y2) and the
        top of the next (y1) for the two lines to be merged into the
        same group.
        NOTE(review): main() passes a value divided by 10000, which
        looks tuned for normalized 0-1 coordinates, while fitz reports
        absolute points — confirm the intended scale.

    Returns
    -------
    pandas.DataFrame
        Columns: ``halaman`` (page index), ``kalimat`` (merged text),
        ``y1_coor`` (top of group), ``y2_coor`` (bottom of group).
    """
    data_geom = []
    data_word = []
    data_page = []

    for page_num, page in enumerate(doc.pages()):
        # BUG FIX: get_text("blocks") returns plain tuples, so the
        # original block["lines"] / line["words"] lookups crashed. The
        # structured layout (blocks -> lines -> spans) comes from
        # get_text("dict").
        for block in page.get_text("dict")["blocks"]:
            # Image blocks (type 1) carry no "lines" key.
            for line in block.get("lines", []):
                data_page.append(page_num)
                data_geom.append(line["bbox"])
                data_word.append(" ".join(span["text"] for span in line["spans"]))

    df = pd.DataFrame({
        "halaman": data_page,
        "kalimat": data_word,
        "posisi": data_geom,
    })

    if df.empty:
        # Nothing extracted: return an empty frame with the expected columns
        # (the original left temp variables unbound and crashed downstream).
        return pd.DataFrame(columns=["halaman", "kalimat", "y1_coor", "y2_coor"])

    # bbox is (x0, y0, x1, y1): keep only the vertical extent.
    df["y1_coor"] = [pos[1] for pos in df["posisi"]]
    df["y2_coor"] = [pos[3] for pos in df["posisi"]]

    # Read lines top-to-bottom within each page before grouping.
    df = df.sort_values(by=["halaman", "y2_coor"])

    merge_kalimat = []
    merge_y1 = []
    merge_y2 = []
    merge_halaman = []

    # Seed the first group with the first line.
    temp_kalimat = [df["kalimat"].iloc[0]]
    temp_y1 = df["y1_coor"].iloc[0]
    temp_y2 = df["y2_coor"].iloc[0]
    temp_hal = df["halaman"].iloc[0]

    for i in range(1, df.shape[0]):
        if abs(temp_y2 - df["y1_coor"].iloc[i]) < diff_line:
            # Close enough vertically: extend the current group. Only the
            # bottom edge moves; the group keeps its original top/page.
            temp_kalimat.append(df["kalimat"].iloc[i])
            temp_y2 = df["y2_coor"].iloc[i]
        else:
            # Gap too large: close the current group and start a new one.
            merge_kalimat.append(" ".join(temp_kalimat))
            merge_y1.append(temp_y1)
            merge_y2.append(temp_y2)
            merge_halaman.append(temp_hal)

            temp_kalimat = [df["kalimat"].iloc[i]]
            temp_y1 = df["y1_coor"].iloc[i]
            temp_y2 = df["y2_coor"].iloc[i]
            temp_hal = df["halaman"].iloc[i]

    # BUG FIX: the original never flushed the final group, silently
    # dropping the document's last paragraph.
    merge_kalimat.append(" ".join(temp_kalimat))
    merge_y1.append(temp_y1)
    merge_y2.append(temp_y2)
    merge_halaman.append(temp_hal)

    return pd.DataFrame({
        "halaman": merge_halaman,
        "kalimat": merge_kalimat,
        "y1_coor": merge_y1,
        "y2_coor": merge_y2,
    })
|
|
|
def highlight_pdf(doc, delete_df):
    """Draw a green rectangle over every row of *delete_df* in *doc*.

    Each row supplies a page index ("halaman") and a vertical band
    ("y1_coor".."y2_coor"); the band is outlined across the page width
    starting at x = 1, with a green 2-unit-wide stroke. Mutates *doc*
    in place.
    """
    bands = zip(delete_df["halaman"], delete_df["y1_coor"], delete_df["y2_coor"])
    for page_no, top, bottom in bands:
        page = doc[int(page_no)]
        rect = fitz.Rect(1, top, page.rect.width, bottom)
        page.draw_rect(rect, color=(0, 1, 0), width=2)
|
|
|
|
|
def get_download_link(file_path, text):
    """Build an HTML anchor that offers *file_path* as a download.

    *text* is the visible link label; the same path is used both as the
    href target and the suggested download filename.
    """
    anchor = '<a href="{0}" download="{0}">{1}</a>'.format(file_path, text)
    return anchor
|
|
|
|
|
def main():
    """Streamlit entry point: upload a PDF, flag sentences semantically
    close to a bag of "deletion" keywords, and offer a highlighted copy.

    Relies on the module-level ``model`` (a SentenceTransformer) being
    defined before this runs — it is created in the ``__main__`` guard.
    """
    st.title("PDF Paragraph Highlighter")

    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])

    # Sidebar tuning knobs: similarity cutoff and line-merge tolerance.
    threshold = st.sidebar.number_input("Threshold", min_value=0.1, max_value=1.0, value=0.4, step=0.1)
    line_diff = st.sidebar.number_input("Line Diff Value", min_value=1.0, max_value=100.0, value=50.0, step=5.0)
    # NOTE(review): this rescale suggests normalized (0-1) page coordinates,
    # but fitz reports absolute points — confirm the intended units.
    line_diff = line_diff / 10000

    if uploaded_file is not None:
        st.subheader("Uploaded PDF Preview")
        st.write(f"Filename: {uploaded_file.name}")

        # BUG FIX: fitz.open() treats a positional argument as a filename,
        # so passing Streamlit's UploadedFile object raised an error. Feed
        # the raw bytes in via the stream keyword instead.
        doc = fitz.open(stream=uploaded_file.getvalue(), filetype="pdf")
        res_df = process_pdf(doc, line_diff)

        st.subheader("Processed DataFrame")
        st.dataframe(res_df)

        # Sentences embedded close to this keyword bag get highlighted.
        search_string = "deletion erasure removal purge expunge wipe clear annulment obliteration destruction"
        embeddings_distilbert = model.encode(res_df['kalimat'].values)
        search_vect = model.encode([search_string])
        distilbert_similar_indexes = find_similar(search_vect, embeddings_distilbert, threshold)
        delete_df = res_df.iloc[distilbert_similar_indexes, :]

        highlight_pdf(doc, delete_df)

        highlighted_path = './highlighted.pdf'
        doc.save(highlighted_path)

        st.subheader("Download Highlighted PDF")
        st.markdown(get_download_link(highlighted_path, 'Download Highlighted PDF'), unsafe_allow_html=True)
|
|
|
if __name__ == "__main__":

    # Loaded once at startup; main() reads this as a module-level global.
    # NOTE(review): if this module is ever imported instead of run as a
    # script, main() will fail with NameError since `model` is only
    # defined inside this guard.
    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    main()
|
|