Spaces:
Runtime error
Runtime error
import streamlit as st | |
import pandas as pd | |
import PyPDF2 | |
import pandas as pd | |
from openpyxl import load_workbook | |
import os | |
import time | |
import requests | |
from io import BytesIO | |
from openpyxl.styles import Border, Side,Alignment | |
department='Default ' | |
sem='Default ' | |
section='Default' | |
def convert_pdf_to_excel(pdf_file): | |
rows = [] | |
firstcolumn='' | |
# Function to extract text from a PDF and process it into a tabular format | |
def extract_data(pdf_file,pdf_name_withoutextension): | |
# Open the PDF file | |
with open(pdf_file, 'rb') as file: | |
reader = PyPDF2.PdfReader(file) | |
for page_num in range(len(reader.pages)): | |
page = reader.pages[page_num] | |
text = page.extract_text() | |
# Split the text into lines and process each line | |
for line in text.split('\n'): | |
rows.append(line) | |
sublists = [] | |
i = 3 # Index to track position in the list | |
space=False | |
while i < len(rows): | |
# Check if the current value is "Department Name" | |
if rows[i] == 'Department Name': | |
# print(rows[i]) | |
# print(rows[i+1]) | |
# print(rows[i+2]) | |
# print(rows[i+3]) | |
# print(rows[i+4]) | |
# print(rows[i+5]) | |
# print(rows[i+6]) | |
# print(rows[i+7]) | |
# print(rows[i+8]) | |
# print(rows[i+9]) | |
# break | |
# Skip next 6 values (including the current one) | |
print(rows[i]) | |
department=rows[i+1] | |
department=department.strip() | |
print("Dept is ",department) | |
sem=rows[i+2] | |
if(sem==' '): | |
space=True | |
sem=rows[i+3] | |
print("sem is ",sem) | |
section=rows[i+5] | |
if(section=='Semester'): | |
section=rows[i+6] | |
print("section is ",section) | |
i += 7 | |
if(space==True): | |
i=i+1 | |
# print(department) | |
# print(sem) | |
# print(section) | |
else: | |
# Extract a sublist of next 3 elements | |
print('val',rows[i]) | |
sublist = rows[i:i + 3] | |
sublists.append(sublist) | |
i += 3 # Move the index to the next set of 3 elements | |
df = pd.DataFrame(sublists) | |
firstcolumn='Student Name '+department+' '+sem+' '+section | |
firstcolumn=firstcolumn.upper() | |
df.to_excel(pdf_name_withoutextension,header=[firstcolumn,'USN','PASSWORD'], index=False) | |
return rows,firstcolumn | |
# Path to your PDF file | |
pdf_file_name= pdf_file | |
pdf_name_withoutextension=pdf_file_name.split('.')[0]+'.xlsx' | |
# Extract table data | |
table_data,firstcolumn = extract_data(pdf_file_name,pdf_name_withoutextension) | |
file_path =pdf_name_withoutextension # Replace with your file path | |
df = pd.read_excel(file_path, engine='openpyxl') | |
# Add a Serial Number column | |
df.insert(0, 'Sl. No.', range(1, 1 + len(df))) | |
# Save the modified DataFrame back to Excel | |
df.to_excel(file_path, index=False) | |
col1 = firstcolumn # Replace with the name of the first column | |
col2 = 'USN' # Replace with the name of the second column | |
# Swap the columns | |
df[col1], df[col2] = df[col2].copy(), df[col1].copy() | |
df.to_excel(file_path, index=False) | |
file_path = pdf_name_withoutextension # Replace with your file path | |
workbook = load_workbook(file_path) | |
# Select the active worksheet | |
worksheet = workbook.active | |
align_center = Alignment(horizontal='center', vertical='center') | |
for col in ['A', 'B', 'C', 'D']: | |
for row in range(1, worksheet.max_row + 1): | |
cell = worksheet[f'{col}{row}'] | |
cell.alignment = align_center | |
thin_border = Border(left=Side(style='thin'), | |
right=Side(style='thin'), | |
top=Side(style='thin'), | |
bottom=Side(style='thin')) | |
# Apply the border to each cell | |
for row in worksheet.iter_rows(): | |
for cell in row: | |
cell.border = thin_border | |
# Set the height of each row | |
for row in worksheet.iter_rows(): | |
worksheet.row_dimensions[row[0].row].height = 25 | |
workbook.save(pdf_name_withoutextension) | |
file_path = pdf_name_withoutextension # Replace with your file path | |
def get_file_content_as_bytes(path): | |
with open(path, "rb") as file: | |
return file.read() | |
file_content = get_file_content_as_bytes(file_path) | |
st.download_button( | |
label='Download Excel File', | |
data=file_content, | |
file_name=file_path, | |
mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' | |
) | |
st.title('PDF to Excel Converter') | |
uploaded_files = st.file_uploader("Choose files", accept_multiple_files=True) | |
for uploaded_file in uploaded_files: | |
if uploaded_file is not None: | |
file_name = uploaded_file.name | |
# Define the file path (current directory in this case) | |
file_path = os.path.join(os.getcwd(), file_name) | |
# Write the file to the current directory | |
with open(file_path, "wb") as f: | |
f.write(uploaded_file.getbuffer()) | |
st.success(f'File "{file_name}" saved at "{file_path}"') | |
excel_file = convert_pdf_to_excel(file_name) | |