File size: 1,347 Bytes
fa21aad 8c553de fa21aad 563bf73 fa21aad 563bf73 fa21aad 8c553de fa21aad 8c553de fa21aad 8c553de fa21aad 8c553de fa21aad c61a55b fa21aad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import streamlit as st
import PyPDF2
import pandas as pd
from fpdf import FPDF
import os
# Function to process the PDF and categorize text
def process_pdf(file):
    """Read a PDF and label each page with the first section keyword it contains.

    The text extracted from every page is checked, in a fixed priority order,
    for the case-sensitive substrings "Introduction", "Methods", "Results",
    "Discussion" and "Conclusion". The first one found becomes the page's
    category; a page containing none of them is labelled "Other".

    Args:
        file: PDF source passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        A ``pandas.DataFrame`` with one row per page and the columns
        ``"Category"`` and ``"Text"``.
    """
    # Order matters: when a page mentions several section names, the
    # earliest entry in this tuple wins.
    section_keywords = (
        "Introduction",
        "Methods",
        "Results",
        "Discussion",
        "Conclusion",
    )

    reader = PyPDF2.PdfReader(file)
    categorized_text = []
    for page in reader.pages:
        # A page with no extractable text may yield a falsy value rather
        # than a usable string; normalise it to "" so the substring checks
        # cannot fail on a non-string and the page is filed under "Other".
        text = page.extract_text() or ""
        category = next(
            (keyword for keyword in section_keywords if keyword in text),
            "Other",
        )
        categorized_text.append((category, text))

    return pd.DataFrame(categorized_text, columns=["Category", "Text"])
# Streamlit app UI
# Page heading followed by a PDF-only upload widget.
st.title("PDF Extraction and Text Categorization")
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

# Everything below only runs once a file has actually been uploaded.
if uploaded_file is not None:
    # Categorize the uploaded PDF page by page and show the table.
    df = process_pdf(uploaded_file)
    st.write("Categorized Text:")
    st.dataframe(df)

    # Serialize the same table (without the index column) to UTF-8 CSV
    # bytes and offer it as a download.
    csv = df.to_csv(index=False).encode("utf-8")
    download_options = {
        "label": "Download Categorized Text as CSV",
        "data": csv,
        "file_name": "categorized_text.csv",
        "mime": "text/csv",
    }
    st.download_button(**download_options)
|