AhmedEwis committed on
Commit
22d61da
1 Parent(s): 1397424

Upload untitled60.py

Browse files
Files changed (1) hide show
  1. untitled60.py +58 -0
untitled60.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Untitled60.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1if8bXKbgk5lh_oSOZ6Vf6dZ8m7MZ1B-p
8
+ """
9
+
10
+ import gradio as gr
11
+ import pandas as pd
12
+ from PyPDF2 import PdfReader
13
+ import re
14
+ import os
15
+
16
def process_pdf(file):
    """Extract username lines from a PDF and write them to an Excel file.

    Lines are collected only after a line containing the marker text
    "Accounts that follow you in Threads" has been seen; the marker line
    itself is skipped, and collection continues across page boundaries.
    While collecting, a line is dropped when it is blank, contains 'AM' or
    'PM' (presumably timestamp lines -- confirm against a sample export),
    or matches the file-path pattern.

    Args:
        file: Either a path to the PDF (``str`` or ``os.PathLike``) or an
            object whose ``name`` attribute holds that path, such as an
            uploaded-file wrapper.

    Returns:
        The path of the written workbook, ``"output.xlsx"``, containing a
        single ``Username`` column.
    """
    # Accept a plain path as well as a wrapper exposing the path as `.name`.
    # The isinstance check comes first because pathlib paths also have a
    # `.name` attribute, which is only the final path component.
    if isinstance(file, (str, os.PathLike)):
        pdf_path = file
    else:
        pdf_path = file.name

    pdf = PdfReader(pdf_path)

    # Matches lines shaped like "<anything>/<anything>.<anything>".
    file_path_pattern = re.compile(r'.*\/.*\..*$')

    data = []

    # Stays True for all later pages once the marker line has been seen.
    start_collecting = False

    for page in pdf.pages:
        # Treat a page with no extractable text as empty instead of failing.
        text = page.extract_text() or ""

        for line in text.split('\n'):
            if "Accounts that follow you in Threads" in line:
                start_collecting = True
                continue

            if not start_collecting or not line.strip():
                continue
            if 'AM' in line or 'PM' in line:
                continue
            if file_path_pattern.match(line):
                continue

            data.append(line)

    # One row per kept line, in the order encountered.
    df = pd.DataFrame(data, columns=['Username'])

    output_file = "output.xlsx"
    df.to_excel(output_file, index=False)
    return output_file
49
+
50
+
51
# Define the Gradio interface: one file upload in, one downloadable file out.
# gr.File is used for the output because the gr.outputs namespace is
# deprecated in Gradio 3 and no longer exists in Gradio 4.
iface = gr.Interface(
    fn=process_pdf,
    inputs="file",
    outputs=gr.File(label="Output Excel"),
    title="PDF to Excel",
    description="Upload a PDF file and get an Excel file with usernames.",
)

# share=True requests a public share link in addition to the local server.
iface.launch(share=True)