Spaces:

jithenderchoudary
/

pdfextract1

Sleeping

App Files Files Community

pdfextract1 / app.py

jithenderchoudary

Update app.py

fc6645c verified over 1 year ago

raw

history blame contribute delete

3.05 kB

	import gradio as gr
	import pdfplumber
	import pandas as pd
	import os

	# Input workbook to filter and the destination of the filtered copy.
	input_file_path = '/mnt/data/extracted_data (1).xlsx'
	output_file_path = '/mnt/data/filtered_positions_10_to_450.xlsx'

	# Working directory that holds the files above.
	directory_path = "/mnt/data/"

	# Create the working directory when it is missing. exist_ok=True replaces the
	# former "check, then create" pair, which could fail if the directory appeared
	# between the two calls.
	os.makedirs(directory_path, exist_ok=True)

	# Load the input workbook when it is present; a missing file is only reported.
	if os.path.exists(input_file_path):
	    df = pd.read_excel(input_file_path)
	    print("File loaded successfully.")
	else:
	    print(f"File not found: {input_file_path}. Please ensure the file is in the directory.")

	# The directory is guaranteed to exist at this point, so list it directly
	# (debugging aid showing which files are available to the app).
	files = os.listdir(directory_path)
	print("Files in /mnt/data/:", files)

	# Define the function to extract data from PDF
	def extract_data(pdf_file_path, start_pos, end_pos):
	    """Read a PDF, build the position table and save it as an Excel workbook.

	    The text of every page is extracted with pdfplumber and printed as a
	    debugging aid. The table written to the workbook is still the hard-coded
	    example structure below; the extracted text is not parsed into rows yet.
	    Rows are kept only when their "Pos" value lies in the inclusive range
	    [start_pos, end_pos].

	    Args:
	        pdf_file_path: Path of the PDF file to open.
	        start_pos: Lowest "Pos" value to keep (inclusive). ``None`` means no
	            lower bound.
	        end_pos: Highest "Pos" value to keep (inclusive). ``None`` means no
	            upper bound.

	    Returns:
	        The path of the saved Excel file on success, otherwise a string
	        starting with "Error:" that describes the failure.

	    NOTE(review): the Gradio output wired to this function is a ``gr.File``;
	    an "Error: ..." string returned here is probably not shown to the user as
	    a message. Consider raising ``gr.Error`` instead -- confirm before changing.
	    """
	    output_path = "/mnt/data/extracted_data.xlsx"
	    try:
	        # Collect the text of every page; the PDF is closed as soon as we are done.
	        with pdfplumber.open(pdf_file_path) as pdf:
	            page_texts = [page.extract_text() for page in pdf.pages]

	        # A page without a text layer must not abort the whole extraction;
	        # only give up when no page produced any text at all.
	        page_texts = [text for text in page_texts if text]
	        if not page_texts:
	            return "Error: Could not extract text from the PDF. Please check the file format."

	        for text in page_texts:
	            print("Extracted Text:", text)  # Debugging line

	        # Example extracted data structure
	        extracted_data = {
	            "Pos": [10, 20, 30],
	            "Item Code": ["155569003011", "155569003012", "155569003013"],
	            "Quantity": [10, 10, 10],
	            "Basic Price": [57.66, 57.66, 57.66],
	            "Sub Total": [576.60, 576.60, 576.60],
	        }
	        df = pd.DataFrame(extracted_data)

	        # Honour the position range requested by the caller (previously ignored).
	        if start_pos is not None:
	            df = df[df["Pos"] >= start_pos]
	        if end_pos is not None:
	            df = df[df["Pos"] <= end_pos]

	        # Do not rely on module-level setup having created the output directory.
	        os.makedirs(os.path.dirname(output_path), exist_ok=True)
	        df.to_excel(output_path, index=False)

	        if os.path.exists(output_path):
	            print("File saved successfully:", output_path)
	            return output_path
	        return "Error: Failed to save the Excel file."

	    except Exception as e:
	        # Boundary of the UI callback: report the failure instead of crashing.
	        print("Error encountered:", str(e))
	        return f"Error: {e}"

	# Build the Gradio UI: one PDF upload plus the two position bounds as inputs,
	# and a downloadable Excel file as the single output.
	_interface_inputs = [
	    gr.File(type="filepath", label="Upload PDF File"),
	    gr.Number(value=10, label="Start Position"),
	    gr.Number(value=450, label="End Position"),
	]
	_interface_output = gr.File(label="Download Extracted Excel")

	interface = gr.Interface(
	    fn=extract_data,
	    inputs=_interface_inputs,
	    outputs=_interface_output,
	)

	# Optional startup step: write a copy of the input workbook that only keeps
	# the rows whose 'Pos' value is between 10 and 450 (both bounds included).
	# Every failure mode that is merely "nothing to filter" is reported and
	# skipped so that the app below is still launched.
	if os.path.exists(input_file_path):
	    df = pd.read_excel(input_file_path)
	    if 'Pos' in df.columns:
	        # Series.between is inclusive on both ends by default.
	        filtered_df = df[df['Pos'].between(10, 450)]
	        filtered_df.to_excel(output_file_path, index=False)
	        print(f"Filtered data saved to: {output_file_path}")
	    else:
	        # A workbook without the column used to raise KeyError here and
	        # prevented the interface from starting.
	        print(f"Column 'Pos' not found in: {input_file_path}. Skipping filtering.")
	else:
	    print(f"Input file not found: {input_file_path}. Skipping filtering.")

	# Start the Gradio app.
	interface.launch()