Spaces:
Runtime error
Runtime error
File size: 5,851 Bytes
24aca4f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
import streamlit as st
import pandas as pd
import PyPDF2
import pandas as pd
from openpyxl import load_workbook
import os
import time
import requests
from io import BytesIO
from openpyxl.styles import Border, Side,Alignment
department='Default '
sem='Default '
section='Default'
def convert_pdf_to_excel(pdf_file):
rows = []
firstcolumn=''
# Function to extract text from a PDF and process it into a tabular format
def extract_data(pdf_file,pdf_name_withoutextension):
# Open the PDF file
with open(pdf_file, 'rb') as file:
reader = PyPDF2.PdfReader(file)
for page_num in range(len(reader.pages)):
page = reader.pages[page_num]
text = page.extract_text()
# Split the text into lines and process each line
for line in text.split('\n'):
rows.append(line)
sublists = []
i = 3 # Index to track position in the list
space=False
while i < len(rows):
# Check if the current value is "Department Name"
if rows[i] == 'Department Name':
# print(rows[i])
# print(rows[i+1])
# print(rows[i+2])
# print(rows[i+3])
# print(rows[i+4])
# print(rows[i+5])
# print(rows[i+6])
# print(rows[i+7])
# print(rows[i+8])
# print(rows[i+9])
# break
# Skip next 6 values (including the current one)
print(rows[i])
department=rows[i+1]
department=department.strip()
print("Dept is ",department)
sem=rows[i+2]
if(sem==' '):
space=True
sem=rows[i+3]
print("sem is ",sem)
section=rows[i+5]
if(section=='Semester'):
section=rows[i+6]
print("section is ",section)
i += 7
if(space==True):
i=i+1
# print(department)
# print(sem)
# print(section)
else:
# Extract a sublist of next 3 elements
print('val',rows[i])
sublist = rows[i:i + 3]
sublists.append(sublist)
i += 3 # Move the index to the next set of 3 elements
df = pd.DataFrame(sublists)
firstcolumn='Student Name '+department+' '+sem+' '+section
firstcolumn=firstcolumn.upper()
df.to_excel(pdf_name_withoutextension,header=[firstcolumn,'USN','PASSWORD'], index=False)
return rows,firstcolumn
# Path to your PDF file
pdf_file_name= pdf_file
pdf_name_withoutextension=pdf_file_name.split('.')[0]+'.xlsx'
# Extract table data
table_data,firstcolumn = extract_data(pdf_file_name,pdf_name_withoutextension)
file_path =pdf_name_withoutextension # Replace with your file path
df = pd.read_excel(file_path, engine='openpyxl')
# Add a Serial Number column
df.insert(0, 'Sl. No.', range(1, 1 + len(df)))
# Save the modified DataFrame back to Excel
df.to_excel(file_path, index=False)
col1 = firstcolumn # Replace with the name of the first column
col2 = 'USN' # Replace with the name of the second column
# Swap the columns
df[col1], df[col2] = df[col2].copy(), df[col1].copy()
df.to_excel(file_path, index=False)
file_path = pdf_name_withoutextension # Replace with your file path
workbook = load_workbook(file_path)
# Select the active worksheet
worksheet = workbook.active
align_center = Alignment(horizontal='center', vertical='center')
for col in ['A', 'B', 'C', 'D']:
for row in range(1, worksheet.max_row + 1):
cell = worksheet[f'{col}{row}']
cell.alignment = align_center
thin_border = Border(left=Side(style='thin'),
right=Side(style='thin'),
top=Side(style='thin'),
bottom=Side(style='thin'))
# Apply the border to each cell
for row in worksheet.iter_rows():
for cell in row:
cell.border = thin_border
# Set the height of each row
for row in worksheet.iter_rows():
worksheet.row_dimensions[row[0].row].height = 25
workbook.save(pdf_name_withoutextension)
file_path = pdf_name_withoutextension # Replace with your file path
def get_file_content_as_bytes(path):
with open(path, "rb") as file:
return file.read()
file_content = get_file_content_as_bytes(file_path)
st.download_button(
label='Download Excel File',
data=file_content,
file_name=file_path,
mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
)
st.title('PDF to Excel Converter')
uploaded_files = st.file_uploader("Choose files", accept_multiple_files=True)
for uploaded_file in uploaded_files:
if uploaded_file is not None:
file_name = uploaded_file.name
# Define the file path (current directory in this case)
file_path = os.path.join(os.getcwd(), file_name)
# Write the file to the current directory
with open(file_path, "wb") as f:
f.write(uploaded_file.getbuffer())
st.success(f'File "{file_name}" saved at "{file_path}"')
excel_file = convert_pdf_to_excel(file_name)
|