"""Streamlit app: upload a PDF, extract text from it, and offer the text as a CSV."""

import base64
from typing import Optional

import pandas as pd
import streamlit as st
from transformers import pipeline


def upload_and_extract_text() -> Optional[str]:
    """Render the PDF uploader and run the uploaded content through the model.

    Returns:
        The ``generated_text`` of the first pipeline result, or ``None`` when
        no file has been uploaded yet.
    """
    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
    if uploaded_file is None:
        return None

    # NOTE(review): "text2text-generation" with a T5 checkpoint is a
    # text-to-text model, not an OCR/PDF parser, and it expects `str` input.
    # Feeding it the raw result of `uploaded_file.read()` is expected to fail
    # or produce meaningless output. Replacing this with a real PDF text
    # extractor (or an OCR model) requires a dependency this file does not
    # import, so the call is left unchanged here -- TODO confirm and replace.
    # NOTE(review): the pipeline is rebuilt on every call; consider caching
    # the loaded model if reruns become slow.
    ocr_pipeline = pipeline("text2text-generation", model="google/t5-v1_1-large")
    results = ocr_pipeline(uploaded_file.read(), max_length=1024, do_sample=False)
    return results[0]["generated_text"]


def text_to_dataframe(text: str) -> pd.DataFrame:
    """Split *text* on newlines into a single-column DataFrame.

    Args:
        text: The text to split; each ``"\\n"``-separated piece becomes a row.

    Returns:
        A DataFrame with one column, ``"Text"``, holding one line per row.
        An empty string yields a single row containing ``""``.
    """
    rows = [[line] for line in text.split("\n")]
    return pd.DataFrame(rows, columns=["Text"])


def main() -> None:
    """Drive the page: upload, show extracted text, preview and offer the CSV."""
    st.title("PDF to Spreadsheet Converter")
    st.write("Upload a PDF file to extract text and save it as a spreadsheet.")

    extracted_text = upload_and_extract_text()
    if extracted_text is None:
        # Nothing uploaded yet; only the title, intro and uploader are shown.
        return

    st.write("### Extracted Text")
    st.write(extracted_text)

    df = text_to_dataframe(extracted_text)
    st.write("### Spreadsheet Preview")
    st.write(df)

    # Embed the CSV in a data URI so the link downloads it without a server
    # round trip. Base64 output contains no characters that need HTML escaping.
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = (
        f'<a href="data:text/csv;base64,{b64}" download="extracted_text.csv">'
        "Download Extracted Text as CSV</a>"
    )
    st.markdown(href, unsafe_allow_html=True)


if __name__ == "__main__":
    main()