pdfextract1 / app.py
jithenderchoudary's picture
Update app.py
fc6645c verified
import gradio as gr
import pdfplumber
import pandas as pd
import os
# Define the path for input and output
input_file_path = '/mnt/data/extracted_data (1).xlsx'
output_file_path = '/mnt/data/filtered_positions_10_to_450.xlsx'
directory_path = "/mnt/data/"

# Ensure the working directory exists. exist_ok=True makes this a no-op when
# the directory is already there and avoids the race between a separate
# exists() check and the makedirs() call.
os.makedirs(directory_path, exist_ok=True)

# Load the input workbook when it is present; otherwise only report that it
# is missing (df stays undefined in that case).
if os.path.exists(input_file_path):
    df = pd.read_excel(input_file_path)
    print("File loaded successfully.")
else:
    print(
        f"File not found: {input_file_path}. "
        "Please ensure the file is in the directory."
    )

# List the directory contents so a missing or misnamed input file is easy to
# spot in the startup log. The directory is guaranteed to exist at this point.
files = os.listdir(directory_path)
print("Files in /mnt/data/:", files)
# Define the function to extract data from PDF
def extract_data(pdf_file_path, start_pos, end_pos):
    """Read the text of a PDF and write an Excel workbook of position rows.

    Each page is opened with pdfplumber and its text is printed for
    debugging. The text is not parsed: the rows written to the workbook are
    the hard-coded example values below, restricted to the requested
    position range.

    Args:
        pdf_file_path: Path of the PDF file to open.
        start_pos: Lowest "Pos" value to keep (inclusive). None disables the
            lower bound.
        end_pos: Highest "Pos" value to keep (inclusive). None disables the
            upper bound.

    Returns:
        The path of the saved workbook on success, otherwise a string
        beginning with "Error:" that describes the failure.
    """
    output_path = "/mnt/data/extracted_data.xlsx"

    # Example extracted data structure. Built before the page loop so it is
    # always defined, including for a PDF that contains no pages.
    extracted_data = {
        "Pos": [10, 20, 30],
        "Item Code": ["155569003011", "155569003012", "155569003013"],
        "Quantity": [10, 10, 10],
        "Basic Price": [57.66, 57.66, 57.66],
        "Sub Total": [576.60, 576.60, 576.60],
    }

    # NOTE(review): this function is wired to a gr.File output, so the error
    # strings returned below are probably not displayed as messages there --
    # confirm, and consider raising gr.Error instead.
    try:
        # Load and process the PDF
        with pdfplumber.open(pdf_file_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text is None:
                    return (
                        "Error: Could not extract text from the PDF. "
                        "Please check the file format."
                    )
                print("Extracted Text:", text)  # Debugging line

        # Convert to DataFrame and keep only the requested position range.
        # The bounds come from the UI and were previously ignored.
        df = pd.DataFrame(extracted_data)
        if start_pos is not None:
            df = df[df["Pos"] >= start_pos]
        if end_pos is not None:
            df = df[df["Pos"] <= end_pos]

        # Save to Excel and confirm the file actually landed on disk.
        df.to_excel(output_path, index=False)
        if os.path.exists(output_path):
            print("File saved successfully:", output_path)
            return output_path
        return "Error: Failed to save the Excel file."
    except Exception as e:
        # Boundary of the UI callback: report the failure instead of raising.
        print("Error encountered:", str(e))
        return f"Error: {e}"
# Set up Gradio interface.
# Components are created in the order: the three inputs, then the output.
pdf_input = gr.File(type="filepath", label="Upload PDF File")
start_input = gr.Number(value=10, label="Start Position")
end_input = gr.Number(value=450, label="End Position")
excel_output = gr.File(label="Download Extracted Excel")

# extract_data receives the uploaded file path and the two numbers, and its
# return value is handed to the file-download component.
interface = gr.Interface(
    fn=extract_data,
    inputs=[pdf_input, start_input, end_input],
    outputs=excel_output,
)
# Additional Excel filtering logic: at startup, write a copy of the input
# workbook restricted to rows whose 'Pos' is between 10 and 450 (inclusive).
if os.path.exists(input_file_path):
    df = pd.read_excel(input_file_path)
    if 'Pos' in df.columns:
        # Series.between is inclusive on both ends by default, matching
        # (Pos >= 10) & (Pos <= 450).
        filtered_df = df[df['Pos'].between(10, 450)]
        filtered_df.to_excel(output_file_path, index=False)
        print(f"Filtered data saved to: {output_file_path}")
    else:
        # Without this guard a workbook lacking the column raises KeyError
        # here, which stops the script before the interface is launched.
        print(f"Column 'Pos' not found in {input_file_path}. Skipping filtering.")
else:
    print(f"Input file not found: {input_file_path}. Skipping filtering.")

# Start the web UI.
interface.launch()