File size: 5,851 Bytes
24aca4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import streamlit as st
import pandas as pd
import PyPDF2
import pandas as pd
from openpyxl import load_workbook
import os
import time
import requests
from io import BytesIO
from openpyxl.styles import Border, Side,Alignment
department='Default '
sem='Default '
section='Default'
def convert_pdf_to_excel(pdf_file):


    rows = []
    firstcolumn=''
    # Function to extract text from a PDF and process it into a tabular format
    def extract_data(pdf_file,pdf_name_withoutextension):
        # Open the PDF file
        with open(pdf_file, 'rb') as file:
            reader = PyPDF2.PdfReader(file)

            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text = page.extract_text()



                # Split the text into lines and process each line
                for line in text.split('\n'):
            
                    rows.append(line)
        
                
            sublists = []

            i = 3  # Index to track position in the list
            space=False
            while i < len(rows):
                # Check if the current value is "Department Name"
                if rows[i] == 'Department Name':
                    # print(rows[i])
                    # print(rows[i+1])
                    # print(rows[i+2])
                    # print(rows[i+3])
                    # print(rows[i+4])
                    # print(rows[i+5])
                    # print(rows[i+6])
                    # print(rows[i+7])
                    # print(rows[i+8])
                    # print(rows[i+9])
                    # break
                  # Skip next 6 values (including the current one)
                    print(rows[i])
                    department=rows[i+1]
                    department=department.strip()
                    print("Dept is ",department)
                    sem=rows[i+2]
                    if(sem==' '):
                        space=True
                        sem=rows[i+3]
                    print("sem is ",sem)
                    section=rows[i+5]  
                    if(section=='Semester'):
                        section=rows[i+6]
                    print("section is ",section)




                    i += 7
                    if(space==True):
                        i=i+1
               
                    
                    # print(department)
                    # print(sem)
                    # print(section)
                else:

                    # Extract a sublist of next 3 elements
                    print('val',rows[i])
                    sublist = rows[i:i + 3]
                    sublists.append(sublist)
                    i += 3  # Move the index to the next set of 3 elements
            
            df = pd.DataFrame(sublists)
            firstcolumn='Student Name '+department+' '+sem+'  '+section
            firstcolumn=firstcolumn.upper()
            df.to_excel(pdf_name_withoutextension,header=[firstcolumn,'USN','PASSWORD'], index=False)
            return rows,firstcolumn

    # Path to your PDF file

    pdf_file_name= pdf_file
    pdf_name_withoutextension=pdf_file_name.split('.')[0]+'.xlsx'


    # Extract table data
    table_data,firstcolumn = extract_data(pdf_file_name,pdf_name_withoutextension)


    file_path =pdf_name_withoutextension  # Replace with your file path
    df = pd.read_excel(file_path, engine='openpyxl')

    # Add a Serial Number column
    df.insert(0, 'Sl. No.', range(1, 1 + len(df)))

    # Save the modified DataFrame back to Excel
    df.to_excel(file_path, index=False)


    col1 = firstcolumn  # Replace with the name of the first column
    col2 = 'USN'  # Replace with the name of the second column

    # Swap the columns
    df[col1], df[col2] = df[col2].copy(), df[col1].copy()
    df.to_excel(file_path, index=False)


    file_path = pdf_name_withoutextension # Replace with your file path
    workbook = load_workbook(file_path)

    # Select the active worksheet
    worksheet = workbook.active
    align_center = Alignment(horizontal='center', vertical='center')
    for col in ['A', 'B', 'C', 'D']:
        for row in range(1, worksheet.max_row + 1):
            cell = worksheet[f'{col}{row}']
            cell.alignment = align_center
    thin_border = Border(left=Side(style='thin'),
                    right=Side(style='thin'),
                    top=Side(style='thin'),
                    bottom=Side(style='thin'))

    # Apply the border to each cell
    for row in worksheet.iter_rows():
        for cell in row:
            cell.border = thin_border
        # Set the height of each row
        for row in worksheet.iter_rows():
            worksheet.row_dimensions[row[0].row].height = 25
        workbook.save(pdf_name_withoutextension)



    file_path = pdf_name_withoutextension  # Replace with your file path

    def get_file_content_as_bytes(path):
        with open(path, "rb") as file:
            return file.read()

   
    file_content = get_file_content_as_bytes(file_path)
    st.download_button(
        label='Download Excel File',
        data=file_content,
        file_name=file_path,
        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )


st.title('PDF to Excel Converter')

uploaded_files = st.file_uploader("Choose files", accept_multiple_files=True)
for uploaded_file in uploaded_files:

    if uploaded_file is not None:
        file_name = uploaded_file.name

        # Define the file path (current directory in this case)
        file_path = os.path.join(os.getcwd(), file_name)

        # Write the file to the current directory
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        st.success(f'File "{file_name}" saved at "{file_path}"')

    excel_file = convert_pdf_to_excel(file_name)