|
import streamlit as st |
|
import PyPDF2 |
|
import pandas as pd |
|
from fpdf import FPDF |
|
import os |
|
|
|
|
|
# Section labels checked against each page, in priority order: the first
# label found anywhere in a page's text becomes that page's category.
SECTION_CATEGORIES = (
    "Introduction",
    "Methods",
    "Results",
    "Discussion",
    "Conclusion",
)


def _categorize_page(text, categories):
    """Return the first label in *categories* occurring in *text*, else "Other"."""
    return next((label for label in categories if label in text), "Other")


def process_pdf(file, categories=SECTION_CATEGORIES):
    """Extract the text of every page of a PDF and label each page.

    Each page is assigned the first entry of *categories* that appears as a
    case-sensitive substring of the page's extracted text. The order of
    *categories* therefore sets the priority when a page mentions several
    labels. Pages matching none of them are labelled "Other".

    Args:
        file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``.
        categories: Ordered labels to look for. Defaults to
            ``SECTION_CATEGORIES``, which reproduces the original
            Introduction / Methods / Results / Discussion / Conclusion
            precedence.

    Returns:
        A ``pandas.DataFrame`` with the columns ``"Category"`` and ``"Text"``
        and one row per page, in page order.

    Errors raised by PyPDF2 while opening or reading the file are not caught
    here and propagate to the caller.
    """
    reader = PyPDF2.PdfReader(file)

    rows = []
    for page in reader.pages:
        # Defensive: treat a missing extraction result as empty text so the
        # substring checks never run against None. A page with no
        # extractable text simply ends up in "Other".
        text = page.extract_text() or ""
        rows.append((_categorize_page(text, categories), text))

    return pd.DataFrame(rows, columns=["Category", "Text"])
|
|
|
|
|
# Streamlit page: upload a PDF, show the per-page categories, offer a CSV.
st.title("PDF Extraction and Text Categorization")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

# The rest of the page is rendered only once a file has been supplied.
if uploaded_file is not None:
    df = process_pdf(uploaded_file)

    st.write("Categorized Text:")
    st.dataframe(df)

    # Serialise the table without its index column, then encode it so the
    # download button receives UTF-8 bytes.
    csv_text = df.to_csv(index=False)
    csv = csv_text.encode("utf-8")

    download_options = {
        "label": "Download Categorized Text as CSV",
        "data": csv,
        "file_name": "categorized_text.csv",
        "mime": "text/csv",
    }
    st.download_button(**download_options)
|
|