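# NOTE(review): the next three lines look like residue copied from the hosting
# web page (avatar alt text, commit message, commit hash); they are kept here
# as comments so the file parses as Python.
# AhmedEwis's picture
# Update app.py
# 79742e4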
# -*- coding: utf-8 -*-
"""Untitled60.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1if8bXKbgk5lh_oSOZ6Vf6dZ8m7MZ1B-p
"""
import gradio as gr
import pandas as pd
from PyPDF2 import PdfReader
import re
import os
def process_pdf(file):
    """Extract username lines from a Threads followers PDF into an Excel file.

    Every page of the PDF is scanned line by line. Nothing is collected until
    a line containing "Accounts that follow you in Threads" has been seen.
    After that, each non-blank line is kept unless it contains 'AM' or 'PM'
    (presumably timestamp lines) or matches the file-path pattern (a '/'
    followed somewhere later by a '.').

    Args:
        file: The uploaded PDF, either as an object whose ``name`` attribute
            holds the path on disk, or as the path itself (``str`` or
            ``os.PathLike``).

    Returns:
        The path of the written workbook, ``"Followers_output.xlsx"``, which
        contains a single ``Username`` column.

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
    """
    if file is None:
        raise ValueError("No PDF file was provided.")

    # Upload wrappers expose the on-disk path as ``.name``; a plain path is
    # used as-is (a ``Path``'s own ``.name`` would only be the basename).
    pdf_path = file if isinstance(file, (str, os.PathLike)) else file.name

    # Load the PDF
    pdf = PdfReader(pdf_path)
    data = []

    # File path regex pattern
    file_path_pattern = re.compile(r'.*\/.*\..*$')

    # Flag that flips once the followers heading has been seen; it stays set
    # across page boundaries.
    start_collecting = False

    # Iterate over all pages in the PDF
    for page in pdf.pages:
        # Guard against a page that yields no text (empty or None).
        text = page.extract_text() or ""
        for line in text.split('\n'):
            if "Accounts that follow you in Threads" in line:
                start_collecting = True
                continue
            if not start_collecting or not line.strip():
                continue
            # NOTE(review): this is a plain substring test, so any line that
            # contains 'AM' or 'PM' anywhere is dropped, not only timestamps.
            if 'AM' in line or 'PM' in line:
                continue
            if file_path_pattern.match(line):
                continue
            data.append(line)

    # Convert the data into a pandas DataFrame
    df = pd.DataFrame(data, columns=['Username'])

    # Save DataFrame to Excel. The name is fixed and relative to the working
    # directory, so each call overwrites the previous output.
    output_file = "Followers_output.xlsx"
    df.to_excel(output_file, index=False)
    return output_file
# Define the Gradio interface: one uploaded file in, one downloadable file out.
iface = gr.Interface(
    fn=process_pdf,
    inputs="file",
    # gr.File is the current component class; it replaces the deprecated
    # gr.outputs.File output namespace.
    outputs=gr.File(label="Output Excel"),
    title="PDF threads data to Excel",
    description="Hi Humood! Just Upload the PDF file of threads data and get an Excel file with usernames.",
)
# Start the web app.
iface.launch()