# -*- coding: utf-8 -*-
"""Untitled60.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1if8bXKbgk5lh_oSOZ6Vf6dZ8m7MZ1B-p
"""

import gradio as gr
import pandas as pd
from PyPDF2 import PdfReader
import re
import os

# Line after which collection starts; nothing before it is kept.
SECTION_MARKER = "Accounts that follow you in Threads"

# Lines that contain a '/' followed later by a '.' are skipped
# (presumably file paths/URLs printed in the export -- confirm against a sample).
FILE_PATH_PATTERN = re.compile(r'.*\/.*\..*$')

# Name of the Excel file written to the current working directory.
OUTPUT_FILE = "Followers_output.xlsx"


def _collect_usernames(page_texts):
    """Return the lines kept as usernames from an iterable of page texts.

    Collection starts after the first line containing ``SECTION_MARKER`` and
    stays on for all following pages. A line is kept when it is non-blank,
    contains neither 'AM' nor 'PM' (presumably timestamp lines -- confirm
    against a sample export), and does not match ``FILE_PATH_PATTERN``.
    Kept lines are returned unmodified (not stripped).
    """
    usernames = []
    collecting = False
    for text in page_texts:
        for line in text.split('\n'):
            if SECTION_MARKER in line:
                # The heading itself is never collected, even if it repeats.
                collecting = True
                continue
            if not collecting or not line.strip():
                continue
            if 'AM' in line or 'PM' in line:
                continue
            if FILE_PATH_PATTERN.match(line):
                continue
            usernames.append(line)
    return usernames


def process_pdf(file):
    """Extract usernames from an uploaded PDF and write them to an Excel file.

    Args:
        file: The upload handed over by Gradio. Either an object exposing the
            file path as ``.name`` or the path itself as a string.

    Returns:
        The path of the Excel file that was written (``OUTPUT_FILE``), with a
        single ``Username`` column.

    Raises:
        ValueError: If no file was provided.
    """
    if file is None:
        raise ValueError("No PDF file was uploaded.")

    # Accept both upload styles: an object with a .name path, or a plain path.
    pdf_path = getattr(file, "name", file)
    pdf = PdfReader(pdf_path)

    # Guard against pages for which no text is returned.
    page_texts = (page.extract_text() or "" for page in pdf.pages)
    usernames = _collect_usernames(page_texts)

    # Convert the data into a pandas DataFrame and save it to Excel
    df = pd.DataFrame(usernames, columns=['Username'])
    df.to_excel(OUTPUT_FILE, index=False)

    return OUTPUT_FILE


# Define the Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs="file",
    # gr.File replaces the deprecated gr.outputs.File component.
    outputs=gr.File(label="Output Excel"),
    title="PDF threads data to Excel",
    description="Hi Humood! Just Upload the PDF file of threads data and get an Excel file with usernames.",
)

# Launch only when executed directly (script or notebook), not on import.
if __name__ == "__main__":
    iface.launch()