import streamlit as st import pandas as pd import PyPDF2 import pandas as pd from openpyxl import load_workbook import os import time import requests from io import BytesIO from openpyxl.styles import Border, Side,Alignment department='Default ' sem='Default ' section='Default' def convert_pdf_to_excel(pdf_file): rows = [] firstcolumn='' # Function to extract text from a PDF and process it into a tabular format def extract_data(pdf_file,pdf_name_withoutextension): # Open the PDF file with open(pdf_file, 'rb') as file: reader = PyPDF2.PdfReader(file) for page_num in range(len(reader.pages)): page = reader.pages[page_num] text = page.extract_text() # Split the text into lines and process each line for line in text.split('\n'): rows.append(line) sublists = [] i = 3 # Index to track position in the list space=False while i < len(rows): # Check if the current value is "Department Name" if rows[i] == 'Department Name': # print(rows[i]) # print(rows[i+1]) # print(rows[i+2]) # print(rows[i+3]) # print(rows[i+4]) # print(rows[i+5]) # print(rows[i+6]) # print(rows[i+7]) # print(rows[i+8]) # print(rows[i+9]) # break # Skip next 6 values (including the current one) print(rows[i]) department=rows[i+1] department=department.strip() print("Dept is ",department) sem=rows[i+2] if(sem==' '): space=True sem=rows[i+3] print("sem is ",sem) section=rows[i+5] if(section=='Semester'): section=rows[i+6] print("section is ",section) i += 7 if(space==True): i=i+1 # print(department) # print(sem) # print(section) else: # Extract a sublist of next 3 elements print('val',rows[i]) sublist = rows[i:i + 3] sublists.append(sublist) i += 3 # Move the index to the next set of 3 elements df = pd.DataFrame(sublists) firstcolumn='Student Name '+department+' '+sem+' '+section firstcolumn=firstcolumn.upper() df.to_excel(pdf_name_withoutextension,header=[firstcolumn,'USN','PASSWORD'], index=False) return rows,firstcolumn # Path to your PDF file pdf_file_name= pdf_file pdf_name_withoutextension=pdf_file_name.split('.')[0]+'.xlsx' # Extract table data table_data,firstcolumn = extract_data(pdf_file_name,pdf_name_withoutextension) file_path =pdf_name_withoutextension # Replace with your file path df = pd.read_excel(file_path, engine='openpyxl') # Add a Serial Number column df.insert(0, 'Sl. No.', range(1, 1 + len(df))) # Save the modified DataFrame back to Excel df.to_excel(file_path, index=False) col1 = firstcolumn # Replace with the name of the first column col2 = 'USN' # Replace with the name of the second column # Swap the columns df[col1], df[col2] = df[col2].copy(), df[col1].copy() df.to_excel(file_path, index=False) file_path = pdf_name_withoutextension # Replace with your file path workbook = load_workbook(file_path) # Select the active worksheet worksheet = workbook.active align_center = Alignment(horizontal='center', vertical='center') for col in ['A', 'B', 'C', 'D']: for row in range(1, worksheet.max_row + 1): cell = worksheet[f'{col}{row}'] cell.alignment = align_center thin_border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'), bottom=Side(style='thin')) # Apply the border to each cell for row in worksheet.iter_rows(): for cell in row: cell.border = thin_border # Set the height of each row for row in worksheet.iter_rows(): worksheet.row_dimensions[row[0].row].height = 25 workbook.save(pdf_name_withoutextension) file_path = pdf_name_withoutextension # Replace with your file path def get_file_content_as_bytes(path): with open(path, "rb") as file: return file.read() file_content = get_file_content_as_bytes(file_path) st.download_button( label='Download Excel File', data=file_content, file_name=file_path, mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' ) st.title('PDF to Excel Converter') uploaded_files = st.file_uploader("Choose files", accept_multiple_files=True) for uploaded_file in uploaded_files: if uploaded_file is not None: file_name = uploaded_file.name # Define the file path (current directory in this case) file_path = os.path.join(os.getcwd(), file_name) # Write the file to the current directory with open(file_path, "wb") as f: f.write(uploaded_file.getbuffer()) st.success(f'File "{file_name}" saved at "{file_path}"') excel_file = convert_pdf_to_excel(file_name)