Spaces:
Runtime error
Runtime error
# -*- coding: utf-8 -*-
"""Untitled60.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1if8bXKbgk5lh_oSOZ6Vf6dZ8m7MZ1B-p
"""
import os
import re

import gradio as gr
import pandas as pd
from PyPDF2 import PdfReader
def process_pdf(file):
    """Extract usernames from a PDF and write them to an Excel workbook.

    Every page's text is scanned line by line.  Collection starts after the
    first line containing "Accounts that follow you in Threads" and stays on
    for all following pages.  Blank lines, lines containing "AM" or "PM", and
    lines that look like a file path are skipped; every other line is kept
    verbatim as one username.

    Args:
        file: The uploaded PDF.  Either a path / stream that ``PdfReader``
            accepts directly, or an object whose ``name`` attribute holds the
            path (e.g. an upload wrapper handed over by the UI framework).

    Returns:
        The path of the written workbook, ``"output.xlsx"`` (relative to the
        current working directory, overwritten on every call).  It has a
        single ``Username`` column.

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
    """
    if file is None:
        raise ValueError("No PDF file was uploaded.")

    # Accept both a wrapper object exposing ``.name`` and a bare path/stream,
    # so the callback does not depend on how the caller packages the upload.
    source = getattr(file, "name", file)
    pdf = PdfReader(source)

    # A line is treated as a file path when it has a '/' followed later by a '.'.
    file_path_pattern = re.compile(r'.*\/.*\..*$')

    usernames = []
    # Stays False until the section header is seen; never reset afterwards,
    # so the section may span several pages.
    start_collecting = False

    for page in pdf.pages:
        # Guard against pages for which no text could be extracted.
        text = page.extract_text() or ""
        for line in text.split('\n'):
            if "Accounts that follow you in Threads" in line:
                start_collecting = True
                continue
            if not start_collecting or not line.strip():
                continue
            # NOTE(review): this substring test is presumably meant to drop
            # timestamp lines, but it also drops any username that contains
            # an upper-case "AM" or "PM" -- confirm against real exports
            # before tightening it.
            if 'AM' in line or 'PM' in line:
                continue
            if file_path_pattern.match(line):
                continue
            usernames.append(line)

    # One row per collected line, under a single "Username" column.
    df = pd.DataFrame(usernames, columns=['Username'])

    output_file = "output.xlsx"
    df.to_excel(output_file, index=False)
    return output_file
# Define the Gradio interface: one file upload in, one downloadable file out.
# The output uses the top-level ``gr.File`` component; the legacy
# ``gr.outputs`` namespace used previously no longer exists in current Gradio
# releases and makes the app fail at start-up.
iface = gr.Interface(
    fn=process_pdf,
    inputs="file",
    outputs=gr.File(label="Output Excel"),
    title="PDF to Excel",
    description="Upload a PDF file and get an Excel file with usernames.",
)

# Start the web server; share=True additionally requests a public share link.
iface.launch(share=True)