"""Gradio app that extracts grade tables from an uploaded PDF.

The app reads the second table that tabula detects on every page, splits the
combined ``LVA`` cell into course title, study programme, course number and
examiner, and reports the total ECTS together with an ECTS-weighted mean
grade.  The processed table is also written to ``sample.csv`` in the current
working directory.
"""

import os
import re
import shutil
import tempfile

import gradio as gr
import numpy as np
import pandas as pd
import tabula
from PyPDF2 import PdfReader

# Header assigned to every extracted table (the tables must have six columns).
_COLUMNS = ['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']

# Course number: three digits, a dot, three word characters.
_COURSE_NUMBER_RE = re.compile(r'(\d{3})\.[\w]{3}')

# Course title: everything up to the first carriage return in the cell.
_COURSE_TITLE_RE = re.compile(r'^(.*?)\r', re.DOTALL)

# Numeric value of each textual grade.
# NOTE(review): 'mit Erfolg teilgenommen' counts as 0 while its ECTS still
# enter the divisor of the weighted mean, which pulls the mean down.  Kept
# as in the original; confirm whether such courses should be excluded.
_GRADE_VALUES = {
    'sehr gut': 1,
    'gut': 2,
    'befriedigend': 3,
    'genügend': 4,
    'nicht genügend': 5,
    'mit Erfolg teilgenommen': 0,
}


def _extract_tables_from_pdf(pdf_path):
    """Return, for every page of *pdf_path*, the list of tables tabula finds."""
    # PyPDF2 is only needed for the page count; tabula re-opens the file.
    with open(pdf_path, 'rb') as file:
        page_count = len(PdfReader(file).pages)

    return [
        tabula.read_pdf(pdf_path, pages=page_number, multiple_tables=True)
        for page_number in range(1, page_count + 1)
    ]


def _extract_information(input_string):
    """Split a combined ``LVA`` cell into its four parts.

    The cell is read as ``<title>\\r<studium><course number><examiner>``.
    When no course number is found, ``Studium`` and ``LVA Nr.`` are empty and
    the whole input is returned as the examiner, as in the original code.
    """
    title_match = _COURSE_TITLE_RE.search(input_string)
    if title_match:
        lva = title_match.group(1).strip()
        # Continue right after the '\r' that terminates the title.
        studium_start = title_match.end()
    else:
        lva = ''
        studium_start = 0

    number_match = _COURSE_NUMBER_RE.search(input_string)
    if number_match:
        lva_nr = number_match.group()
        number_start, number_end = number_match.span()
    else:
        lva_nr = ''
        number_start = number_end = 0

    return {
        'LVA': lva,
        'Studium': input_string[studium_start:number_start],
        'LVA Nr.': lva_nr,
        "Prüfer'in": input_string[number_end:],
    }


def _prepare_page_table(table):
    """Normalise one extracted table and split its ``LVA`` column."""
    # Work on a copy so the table returned by tabula is left untouched.
    data_df = pd.DataFrame(table).copy()

    # The current column labels hold the first data row; move them back into
    # the frame as row 0 before assigning the real header.
    data_df.loc[-1] = data_df.columns
    data_df.index = data_df.index + 1
    data_df.sort_index(inplace=True)
    data_df.columns = _COLUMNS

    # Missing cells are treated as empty text so the regex search cannot fail.
    parsed = [
        _extract_information('' if pd.isna(cell) else str(cell))
        for cell in data_df['LVA']
    ]

    # Whole-column assignment instead of chained per-cell assignment, which
    # is not guaranteed to write through to the frame.
    insert_index = 1
    data_df.insert(insert_index, 'LVA-Nr.', [p['LVA Nr.'] for p in parsed])
    data_df.insert(insert_index, 'Studium', [p['Studium'] for p in parsed])
    data_df.insert(insert_index, 'Prüfer*in', [p["Prüfer'in"] for p in parsed])
    data_df['LVA'] = [p['LVA'] for p in parsed]
    return data_df


def pdf_processing(pdf_file):
    """Extract the grade tables from *pdf_file* and summarise them.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF.

    Returns:
        A tuple ``(total_ects, weighted_mean, final_df, csv_path)``: the sum
        of the ``ECTS`` column, the ECTS-weighted mean grade rounded to two
        decimals (NaN when the ECTS total is zero), the processed DataFrame,
        and the path of the CSV file that was written.

    Raises:
        ValueError: If the ``Beurteilung`` column contains a grade that has
            no numeric value in the grade table.
    """
    extracted_tables = _extract_tables_from_pdf(pdf_file.name)

    # Start with an empty frame so the result keeps the expected columns (and
    # their order) even when no page yields a table.
    frames = [pd.DataFrame(columns=_COLUMNS)]
    for page_tables in extracted_tables:
        try:
            # Only the second table on each page is used.
            table = page_tables[1]
        except IndexError:
            # No second table on this page: skip it instead of falling
            # through with an undefined or stale table.
            continue
        frames.append(_prepare_page_table(table))

    final_df = pd.concat(frames, ignore_index=True)

    # Remove \r from the grade column
    final_df['Beurteilung'] = final_df['Beurteilung'].str.replace('\r', ' ')

    # Convert the data type of 'ECTS' column to float
    final_df['ECTS'] = final_df['ECTS'].astype(float)

    final_df = final_df.drop_duplicates()

    column_sum = final_df['ECTS'].sum()

    # Translate textual grades to numbers; whitespace around a grade is
    # ignored for the lookup only, the stored text is not modified.
    grade_text = final_df['Beurteilung'].str.strip()
    grade_values = grade_text.map(_GRADE_VALUES).astype(float)
    unknown = grade_text[grade_values.isna()].dropna().unique()
    if len(unknown) > 0:
        raise ValueError(
            'Unrecognised grade(s): ' + ', '.join(map(str, unknown))
        )

    if column_sum:
        weighted = final_df['ECTS'].to_numpy() * grade_values.to_numpy()
        wm = np.sum(weighted) / column_sum
    else:
        # Nothing to average; avoid a 0/0 division warning.
        wm = np.nan

    # Save the CSV file to the working directory
    csv_save_path = os.path.join(os.getcwd(), "sample.csv")
    final_df.to_csv(csv_save_path, index=False)

    return column_sum, np.round(wm, 2), final_df, csv_save_path


# Define the Gradio interface
# NOTE(review): gr.inputs / gr.outputs and .style() are kept exactly as in the
# original; confirm the installed Gradio version still provides them.
inputs = gr.inputs.File(label="Select PDF file", type="file")
outputs = [
    gr.outputs.Textbox(label="Total ECTS"),
    gr.outputs.Textbox(label="Weighted Mean"),
    gr.outputs.Dataframe(type="pandas", label="Processed DataFrame").style(full_width=True),
    gr.outputs.File(),
]

iface = gr.Interface(
    fn=pdf_processing,
    inputs=inputs,
    outputs=outputs,
    title="KUSSS Buddy",
)

# Launch the Gradio interface
iface.launch(share=False)