PDFToExcel / app.py
AjitSHegde's picture
Upload 4 files
24aca4f verified
import streamlit as st
import pandas as pd
import PyPDF2
import pandas as pd
from openpyxl import load_workbook
import os
import time
import requests
from io import BytesIO
from openpyxl.styles import Border, Side,Alignment
department='Default '
sem='Default '
section='Default'
def convert_pdf_to_excel(pdf_file):
rows = []
firstcolumn=''
# Function to extract text from a PDF and process it into a tabular format
def extract_data(pdf_file,pdf_name_withoutextension):
# Open the PDF file
with open(pdf_file, 'rb') as file:
reader = PyPDF2.PdfReader(file)
for page_num in range(len(reader.pages)):
page = reader.pages[page_num]
text = page.extract_text()
# Split the text into lines and process each line
for line in text.split('\n'):
rows.append(line)
sublists = []
i = 3 # Index to track position in the list
space=False
while i < len(rows):
# Check if the current value is "Department Name"
if rows[i] == 'Department Name':
# print(rows[i])
# print(rows[i+1])
# print(rows[i+2])
# print(rows[i+3])
# print(rows[i+4])
# print(rows[i+5])
# print(rows[i+6])
# print(rows[i+7])
# print(rows[i+8])
# print(rows[i+9])
# break
# Skip next 6 values (including the current one)
print(rows[i])
department=rows[i+1]
department=department.strip()
print("Dept is ",department)
sem=rows[i+2]
if(sem==' '):
space=True
sem=rows[i+3]
print("sem is ",sem)
section=rows[i+5]
if(section=='Semester'):
section=rows[i+6]
print("section is ",section)
i += 7
if(space==True):
i=i+1
# print(department)
# print(sem)
# print(section)
else:
# Extract a sublist of next 3 elements
print('val',rows[i])
sublist = rows[i:i + 3]
sublists.append(sublist)
i += 3 # Move the index to the next set of 3 elements
df = pd.DataFrame(sublists)
firstcolumn='Student Name '+department+' '+sem+' '+section
firstcolumn=firstcolumn.upper()
df.to_excel(pdf_name_withoutextension,header=[firstcolumn,'USN','PASSWORD'], index=False)
return rows,firstcolumn
# Path to your PDF file
pdf_file_name= pdf_file
pdf_name_withoutextension=pdf_file_name.split('.')[0]+'.xlsx'
# Extract table data
table_data,firstcolumn = extract_data(pdf_file_name,pdf_name_withoutextension)
file_path =pdf_name_withoutextension # Replace with your file path
df = pd.read_excel(file_path, engine='openpyxl')
# Add a Serial Number column
df.insert(0, 'Sl. No.', range(1, 1 + len(df)))
# Save the modified DataFrame back to Excel
df.to_excel(file_path, index=False)
col1 = firstcolumn # Replace with the name of the first column
col2 = 'USN' # Replace with the name of the second column
# Swap the columns
df[col1], df[col2] = df[col2].copy(), df[col1].copy()
df.to_excel(file_path, index=False)
file_path = pdf_name_withoutextension # Replace with your file path
workbook = load_workbook(file_path)
# Select the active worksheet
worksheet = workbook.active
align_center = Alignment(horizontal='center', vertical='center')
for col in ['A', 'B', 'C', 'D']:
for row in range(1, worksheet.max_row + 1):
cell = worksheet[f'{col}{row}']
cell.alignment = align_center
thin_border = Border(left=Side(style='thin'),
right=Side(style='thin'),
top=Side(style='thin'),
bottom=Side(style='thin'))
# Apply the border to each cell
for row in worksheet.iter_rows():
for cell in row:
cell.border = thin_border
# Set the height of each row
for row in worksheet.iter_rows():
worksheet.row_dimensions[row[0].row].height = 25
workbook.save(pdf_name_withoutextension)
file_path = pdf_name_withoutextension # Replace with your file path
def get_file_content_as_bytes(path):
with open(path, "rb") as file:
return file.read()
file_content = get_file_content_as_bytes(file_path)
st.download_button(
label='Download Excel File',
data=file_content,
file_name=file_path,
mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
)
st.title('PDF to Excel Converter')
uploaded_files = st.file_uploader("Choose files", accept_multiple_files=True)
for uploaded_file in uploaded_files:
if uploaded_file is not None:
file_name = uploaded_file.name
# Define the file path (current directory in this case)
file_path = os.path.join(os.getcwd(), file_name)
# Write the file to the current directory
with open(file_path, "wb") as f:
f.write(uploaded_file.getbuffer())
st.success(f'File "{file_name}" saved at "{file_path}"')
excel_file = convert_pdf_to_excel(file_name)