|
import streamlit as st |
|
import pandas as pd |
|
from PyPDF2 import PdfReader |
|
from difflib import ndiff |
|
import os |
|
import matplotlib.pyplot as plt |
|
from doctr.io import DocumentFile |
|
from doctr.models import ocr_predictor |
|
from sentence_transformers import SentenceTransformer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import fitz |
|
|
|
|
|
def find_similar(vector_representation, all_representations, threshold=0.5):
    """Return indices of rows in *all_representations* whose cosine
    similarity to *vector_representation* is strictly above *threshold*.

    Parameters
    ----------
    vector_representation : array-like of shape (1, dim)
        The query embedding (a single row).
    all_representations : array-like of shape (n, dim)
        Candidate embeddings to compare against.
    threshold : float, optional
        Strict lower bound on similarity for a row to be reported.

    Returns
    -------
    list[int]
        Positional indices of the similar rows.
    """
    # cosine_similarity returns a (1, n) matrix; keep the single row.
    similarities = cosine_similarity(vector_representation, all_representations)[0]

    # BUG FIX: the original used np.where without importing numpy, which
    # raised NameError at runtime. A plain comprehension needs no extra
    # dependency and yields the same list of indices.
    return [idx for idx, score in enumerate(similarities) if score > threshold]
|
|
|
def process_pdf(doc, diff_line):
    """Extract text lines from a PyMuPDF document and merge vertically
    adjacent lines into paragraph-like groups.

    Parameters
    ----------
    doc : fitz.Document
        An open PyMuPDF document (anything exposing ``pages()`` whose
        pages implement ``get_text("dict")``).
    diff_line : float
        Maximum allowed gap between the bottom of one line (y2) and the
        top of the next (y1) for the two lines to be merged into the
        same group.
        NOTE(review): main() passes a value divided by 10000, which
        looks tuned for normalized 0-1 coordinates, while fitz reports
        absolute points — confirm the intended scale.

    Returns
    -------
    pandas.DataFrame
        Columns: ``halaman`` (page index), ``kalimat`` (merged text),
        ``y1_coor`` (top of group), ``y2_coor`` (bottom of group).
    """
    data_geom = []
    data_word = []
    data_page = []

    for page_num, page in enumerate(doc.pages()):
        # BUG FIX: get_text("blocks") returns plain tuples, so the
        # original block["lines"] / line["words"] lookups crashed. The
        # structured layout (blocks -> lines -> spans) comes from
        # get_text("dict").
        for block in page.get_text("dict")["blocks"]:
            # Image blocks (type 1) carry no "lines" key.
            for line in block.get("lines", []):
                data_page.append(page_num)
                data_geom.append(line["bbox"])
                data_word.append(" ".join(span["text"] for span in line["spans"]))

    df = pd.DataFrame({
        "halaman": data_page,
        "kalimat": data_word,
        "posisi": data_geom,
    })

    if df.empty:
        # Nothing extracted: return an empty frame with the expected columns
        # (the original left temp variables unbound and crashed downstream).
        return pd.DataFrame(columns=["halaman", "kalimat", "y1_coor", "y2_coor"])

    # bbox is (x0, y0, x1, y1): keep only the vertical extent.
    df["y1_coor"] = [pos[1] for pos in df["posisi"]]
    df["y2_coor"] = [pos[3] for pos in df["posisi"]]

    # Read lines top-to-bottom within each page before grouping.
    df = df.sort_values(by=["halaman", "y2_coor"])

    merge_kalimat = []
    merge_y1 = []
    merge_y2 = []
    merge_halaman = []

    # Seed the first group with the first line.
    temp_kalimat = [df["kalimat"].iloc[0]]
    temp_y1 = df["y1_coor"].iloc[0]
    temp_y2 = df["y2_coor"].iloc[0]
    temp_hal = df["halaman"].iloc[0]

    for i in range(1, df.shape[0]):
        if abs(temp_y2 - df["y1_coor"].iloc[i]) < diff_line:
            # Close enough vertically: extend the current group. Only the
            # bottom edge moves; the group keeps its original top/page.
            temp_kalimat.append(df["kalimat"].iloc[i])
            temp_y2 = df["y2_coor"].iloc[i]
        else:
            # Gap too large: close the current group and start a new one.
            merge_kalimat.append(" ".join(temp_kalimat))
            merge_y1.append(temp_y1)
            merge_y2.append(temp_y2)
            merge_halaman.append(temp_hal)

            temp_kalimat = [df["kalimat"].iloc[i]]
            temp_y1 = df["y1_coor"].iloc[i]
            temp_y2 = df["y2_coor"].iloc[i]
            temp_hal = df["halaman"].iloc[i]

    # BUG FIX: the original never flushed the final group, silently
    # dropping the document's last paragraph.
    merge_kalimat.append(" ".join(temp_kalimat))
    merge_y1.append(temp_y1)
    merge_y2.append(temp_y2)
    merge_halaman.append(temp_hal)

    return pd.DataFrame({
        "halaman": merge_halaman,
        "kalimat": merge_kalimat,
        "y1_coor": merge_y1,
        "y2_coor": merge_y2,
    })
|
|
|
def highlight_pdf(doc, delete_df):
    """Draw a green rectangle over every row of *delete_df* in *doc*.

    Each row supplies a page index ("halaman") and a vertical band
    ("y1_coor".."y2_coor"); the band is outlined across the page width
    starting at x = 1, with a green 2-unit-wide stroke. Mutates *doc*
    in place.
    """
    bands = zip(delete_df["halaman"], delete_df["y1_coor"], delete_df["y2_coor"])
    for page_no, top, bottom in bands:
        page = doc[int(page_no)]
        rect = fitz.Rect(1, top, page.rect.width, bottom)
        page.draw_rect(rect, color=(0, 1, 0), width=2)
|
|
|
|
|
def get_download_link(file_path, text):
    """Build an HTML anchor that offers *file_path* as a download.

    *text* is the visible link label; the same path is used both as the
    href target and the suggested download filename.
    """
    anchor = '<a href="{0}" download="{0}">{1}</a>'.format(file_path, text)
    return anchor
|
|
|
|
|
def main():
    """Streamlit entry point: upload a PDF, flag sentences semantically
    close to a bag of "deletion" keywords, and offer a highlighted copy.

    Relies on the module-level ``model`` (a SentenceTransformer) being
    defined before this runs — it is created in the ``__main__`` guard.
    """
    st.title("PDF Paragraph Highlighter")

    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])

    # Sidebar tuning knobs: similarity cutoff and line-merge tolerance.
    threshold = st.sidebar.number_input("Threshold", min_value=0.1, max_value=1.0, value=0.4, step=0.1)
    line_diff = st.sidebar.number_input("Line Diff Value", min_value=1.0, max_value=100.0, value=50.0, step=5.0)
    # NOTE(review): this rescale suggests normalized (0-1) page coordinates,
    # but fitz reports absolute points — confirm the intended units.
    line_diff = line_diff / 10000

    if uploaded_file is not None:
        st.subheader("Uploaded PDF Preview")
        st.write(f"Filename: {uploaded_file.name}")

        # BUG FIX: fitz.open() treats a positional argument as a filename,
        # so passing Streamlit's UploadedFile object raised an error. Feed
        # the raw bytes in via the stream keyword instead.
        doc = fitz.open(stream=uploaded_file.getvalue(), filetype="pdf")
        res_df = process_pdf(doc, line_diff)

        st.subheader("Processed DataFrame")
        st.dataframe(res_df)

        # Sentences embedded close to this keyword bag get highlighted.
        search_string = "deletion erasure removal purge expunge wipe clear annulment obliteration destruction"
        embeddings_distilbert = model.encode(res_df['kalimat'].values)
        search_vect = model.encode([search_string])
        distilbert_similar_indexes = find_similar(search_vect, embeddings_distilbert, threshold)
        delete_df = res_df.iloc[distilbert_similar_indexes, :]

        highlight_pdf(doc, delete_df)

        highlighted_path = './highlighted.pdf'
        doc.save(highlighted_path)

        st.subheader("Download Highlighted PDF")
        st.markdown(get_download_link(highlighted_path, 'Download Highlighted PDF'), unsafe_allow_html=True)
|
|
|
if __name__ == "__main__":

    # Loaded once at startup; main() reads this as a module-level global.
    # NOTE(review): if this module is ever imported instead of run as a
    # script, main() will fail with NameError since `model` is only
    # defined inside this guard.
    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    main()
|
|