# FP-PDST / app.py
import base64
import os

import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Function to find similar paragraphs using cosine similarity
def find_similar(vector_representation, all_representations, threshold=0.5):
    similarity_matrix = cosine_similarity(vector_representation, all_representations)
    similarities = similarity_matrix[0]
    # Keep indices where similarity is above the threshold
    similar_indices = np.where(similarities > threshold)[0]
    return similar_indices.tolist()
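
# Usage sketch (hypothetical strings and threshold values): the query is encoded to a
# (1, d) array and the paragraphs to an (n, d) array, so cosine_similarity returns a
# (1, n) matrix whose single row is compared against the threshold, e.g.
#   query = model.encode(["delete the record"])
#   corpus = model.encode(["remove this row", "append a new row"])
#   find_similar(query, corpus, threshold=0.4)   # -> e.g. [0]
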
def process_pdf(doc, diff_line):
    """Extract text lines from a PyMuPDF document and merge them into paragraphs."""
    data_geom = []
    data_word = []
    data_page = []
    for page_num, page in enumerate(doc):
        # "dict" extraction yields blocks -> lines -> spans, each with text and a bounding box
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:  # skip image blocks
                continue
            for line in block["lines"]:
                data_page.append(page_num)
                data_geom.append(line["bbox"])
                data_word.append(" ".join(span["text"] for span in line["spans"]))
    df = pd.DataFrame({
        "halaman": data_page,   # page number
        "kalimat": data_word,   # line text
        "posisi": data_geom     # line bounding box (x0, y0, x1, y1)
    })
    df["y1_coor"] = [pos[1] for pos in df["posisi"]]
    df["y2_coor"] = [pos[3] for pos in df["posisi"]]
    df = df.sort_values(by=["halaman", "y2_coor"])

    # Merge consecutive lines into one paragraph while the vertical gap stays below diff_line
    temp_kalimat = []
    merge_kalimat = []
    merge_y1 = []
    merge_y2 = []
    merge_halaman = []
    for i in range(df.shape[0]):
        if i == 0:
            temp_y2 = df["y2_coor"].iloc[i]
            temp_y1 = df["y1_coor"].iloc[i]
            temp_hal = df["halaman"].iloc[i]
            temp_kalimat.append(df["kalimat"].iloc[i])
        elif abs(temp_y2 - df["y1_coor"].iloc[i]) < diff_line:
            temp_kalimat.append(df["kalimat"].iloc[i])
            temp_y2 = df["y2_coor"].iloc[i]
        else:
            merge_kalimat.append(" ".join(temp_kalimat))
            merge_y1.append(temp_y1)
            merge_y2.append(temp_y2)
            merge_halaman.append(temp_hal)
            temp_y2 = df["y2_coor"].iloc[i]
            temp_y1 = df["y1_coor"].iloc[i]
            temp_hal = df["halaman"].iloc[i]
            temp_kalimat = [df["kalimat"].iloc[i]]
    # Flush the last buffered paragraph after the loop
    if temp_kalimat:
        merge_kalimat.append(" ".join(temp_kalimat))
        merge_y1.append(temp_y1)
        merge_y2.append(temp_y2)
        merge_halaman.append(temp_hal)

    return pd.DataFrame({
        "halaman": merge_halaman,
        "kalimat": merge_kalimat,
        "y1_coor": merge_y1,
        "y2_coor": merge_y2
    })
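
# Merge-step sketch (illustrative numbers, not tied to the app's slider scaling):
# with diff_line=5.0, a line with bbox (.., y1=100, .., y2=110) and a following line
# with y1=112 join the same paragraph because |110 - 112| < 5, while a later line at
# y1=160 starts a new one. The returned DataFrame has one row per merged paragraph
# with columns halaman (page), kalimat (text), y1_coor and y2_coor (paragraph top/bottom).
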
def highlight_pdf(doc, delete_df):
    for i in range(delete_df.shape[0]):
        norm_y1 = delete_df["y1_coor"].iloc[i]
        norm_y2 = delete_df["y2_coor"].iloc[i]
        page = int(delete_df["halaman"].iloc[i])
        doc[page].draw_rect(
            fitz.Rect(1, norm_y1, doc[page].rect.width, norm_y2),
            color=(0, 1, 0),
            width=2
        )
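
# Note: fitz.Rect takes (x0, y0, x1, y1) in PDF points with the origin at the top-left
# of the page, so the rectangle above spans the full page width between the paragraph's
# top (y1_coor) and bottom (y2_coor) and is drawn as a green outline rather than a
# translucent text highlight.
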
def get_download_link(file_path, text):
    """Generate an HTML download link for the given file, embedded as base64 data."""
    with open(file_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    file_name = os.path.basename(file_path)
    href = f'<a href="data:application/pdf;base64,{b64}" download="{file_name}">{text}</a>'
    return href
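
# Alternative sketch (assumes the highlighted file has already been written to
# highlighted_path): Streamlit's built-in download widget can replace the hand-rolled
# HTML link, e.g.
#   with open(highlighted_path, "rb") as f:
#       st.download_button("Download Highlighted PDF", f.read(),
#                          file_name="highlighted.pdf", mime="application/pdf")
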
# Main Streamlit app
def main():
    st.title("PDF Paragraph Highlighter")

    # Upload PDF file
    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])

    # Sidebar for threshold and line_diff values
    threshold = st.sidebar.number_input("Threshold", min_value=0.1, max_value=1.0, value=0.4, step=0.1)
    line_diff = st.sidebar.number_input("Line Diff Value", min_value=1.0, max_value=100.0, value=50.0, step=5.0)
    line_diff = line_diff / 10000  # scale the slider value into the line-gap threshold

    if uploaded_file is not None:
        st.subheader("Uploaded PDF Preview")
        st.write(f"Filename: {uploaded_file.name}")

        # Open the uploaded PDF from its in-memory bytes
        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
        res_df = process_pdf(doc, line_diff)

        # Display processed DataFrame
        st.subheader("Processed DataFrame")
        st.dataframe(res_df)

        # Search for paragraphs similar to deletion-related vocabulary
        search_string = "deletion erasure removal purge expunge wipe clear annulment obliteration destruction"
        embeddings_distilbert = model.encode(res_df['kalimat'].values)
        search_vect = model.encode([search_string])
        distilbert_similar_indexes = find_similar(search_vect, embeddings_distilbert, threshold)
        delete_df = res_df.iloc[distilbert_similar_indexes, :]

        # Highlight matching paragraphs in the PDF
        highlight_pdf(doc, delete_df)

        # Save the highlighted PDF
        highlighted_path = './highlighted.pdf'
        doc.save(highlighted_path)

        # Display download link for the highlighted PDF
        st.subheader("Download Highlighted PDF")
        st.markdown(get_download_link(highlighted_path, 'Download Highlighted PDF'), unsafe_allow_html=True)

if __name__ == "__main__":
    # Load the SentenceTransformer model
    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
    # Run the Streamlit app
    main()