# -*- coding: utf-8 -*-
"""Untitled60.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1if8bXKbgk5lh_oSOZ6Vf6dZ8m7MZ1B-p
"""

import gradio as gr
import pandas as pd
from PyPDF2 import PdfReader
import re
import os

def process_pdf(file):
    """Extract follower usernames from a Threads data PDF into an Excel file.

    Lines are collected only after a line containing
    "Accounts that follow you in Threads" has been seen; once seen, the
    marker stays in effect for all following pages. Blank lines, lines
    containing 'AM' or 'PM', and lines matching the file-path pattern are
    skipped; every remaining line is stored as one row.

    Args:
        file: The uploaded PDF. Either an object exposing the file path in a
            ``name`` attribute, or a path / binary stream that ``PdfReader``
            can open directly.

    Returns:
        The path of the written workbook ("Followers_output.xlsx"), which
        contains a single "Username" column.

    Raises:
        ValueError: If no file was provided.
    """
    if file is None:
        raise ValueError("No PDF file was provided.")

    # Upload wrappers expose the on-disk path as `.name`; anything without
    # that attribute (e.g. a plain path string) is handed to PdfReader as-is.
    pdf = PdfReader(getattr(file, "name", file))

    # Matches any line containing a '/' that is later followed by a '.',
    # i.e. something shaped like "dir/file.ext".
    file_path_pattern = re.compile(r'.*\/.*\..*$')

    data = []

    # Flips to True at the marker line and is never reset, so collection
    # continues across page boundaries.
    start_collecting = False

    for page in pdf.pages:
        # Defensive: treat a missing/empty extraction result as "no text"
        # rather than calling .split on a non-string.
        text = page.extract_text() or ""

        for line in text.split('\n'):
            if "Accounts that follow you in Threads" in line:
                start_collecting = True
                continue

            if not start_collecting or not line.strip():
                continue

            # NOTE(review): 'AM'/'PM' presumably identify timestamp lines in
            # the export -- confirm against a sample PDF.
            if 'AM' in line or 'PM' in line or file_path_pattern.match(line):
                continue

            data.append(line)

    # Convert the data into a pandas DataFrame
    df = pd.DataFrame(data, columns=['Username'])

    # Save DataFrame to Excel
    output_file = "Followers_output.xlsx"
    df.to_excel(output_file, index=False)
    return output_file


# Define the Gradio interface.
# The output component is created with `gr.File`: the legacy `gr.outputs`
# namespace is deprecated in Gradio 3.x and not available in later releases.
iface = gr.Interface(
    fn=process_pdf,
    inputs="file",
    outputs=gr.File(label="Output Excel"),
    title="PDF threads data to Excel",
    description="Hi Humood! Just Upload the PDF file of threads data and get an Excel file with usernames.",
)

# Start the web UI as soon as the script runs.
iface.launch()