Spaces:

krishnavadithya
/

invoice_processing

Sleeping

App Files Files Community

invoice_processing / src /file_processor.py

krishnavadithya

Upload 6 files

a36a2f6 verified 3 days ago

raw

history blame contribute delete

2.97 kB

	import os
	from typing import Optional
	import pandas as pd
	from pdf2image import convert_from_path
	from PIL import Image
	from .config import settings
	from striprtf.striprtf import rtf_to_text
	from docx import Document


	class FileProcessor:
	    """Load supported input files and return their content.

	    Each ``process_*`` method handles one family of file types.
	    ``process_file`` picks the right one from the file extension.
	    The return type depends on the file type: PDFs and images come back
	    as image objects, everything else as text (long ``.doc`` files as a
	    list of text chunks).
	    """

	    # A ``.doc`` text with more lines than this is returned in chunks.
	    DOC_SPLIT_LINE_THRESHOLD = 200
	    # Number of chunks a long ``.doc`` text is divided into.
	    DOC_SPLIT_PARTS = 3

	    def __init__(self):
	        """Create a processor; no state is kept on the instance."""

	    def process_pdf(self, file_path: str) -> list:
	        """Return whatever ``convert_from_path`` yields for the PDF.

	        That is a list of images (presumably one per page -- confirm
	        against the pdf2image documentation).
	        """
	        return convert_from_path(file_path)

	    def process_image(self, file_path: str) -> "Image.Image":
	        """Open the image at ``file_path`` and return it converted to RGB."""
	        return Image.open(file_path).convert('RGB')

	    @staticmethod
	    def _split_text(data: str, parts: int = 3) -> list:
	        """Split ``data`` by lines into exactly ``parts`` chunks.

	        Every chunk except the last holds ``line_count // parts`` lines;
	        the last chunk also takes the remainder. Joining the chunks with
	        a newline reproduces ``data``. Text with fewer lines than
	        ``parts`` is returned as a single chunk.
	        """
	        lines = data.split('\n')
	        chunk_len = len(lines) // parts if parts > 0 else 0
	        if parts <= 1 or chunk_len == 0:
	            return [data]
	        # Cut points for the first ``parts - 1`` chunks; the final chunk
	        # runs to the end so uneven counts never create an extra chunk.
	        bounds = [i * chunk_len for i in range(parts)] + [len(lines)]
	        return [
	            '\n'.join(lines[start:stop])
	            for start, stop in zip(bounds, bounds[1:])
	        ]

	    def process_doc(self, file_path: str) -> "str | list[str]":
	        """Extract the text of a ``.doc`` file.

	        The file is read as text and run through ``rtf_to_text``, so the
	        content is expected to be RTF.
	        NOTE(review): a binary Word ``.doc`` is unlikely to decode into
	        meaningful text this way -- confirm which inputs are expected.

	        Returns:
	            The extracted text, or a list of ``DOC_SPLIT_PARTS`` text
	            chunks when it has more than ``DOC_SPLIT_LINE_THRESHOLD``
	            lines. Paths not ending in ``.doc`` are handled by
	            ``process_docx``.
	        """
	        if not file_path.lower().endswith('.doc'):
	            return self.process_docx(file_path)

	        # Undecodable bytes are dropped rather than raising.
	        with open(file_path, "r", encoding='utf-8', errors='ignore') as file:
	            data = rtf_to_text(file.read())

	        line_count = data.count('\n') + 1
	        if line_count > self.DOC_SPLIT_LINE_THRESHOLD:
	            return self._split_text(data, self.DOC_SPLIT_PARTS)
	        return data

	    def process_docx(self, file_path: str) -> str:
	        """Return the paragraph texts of a ``.docx`` file joined by newlines."""
	        doc = Document(file_path)
	        return '\n'.join(paragraph.text for paragraph in doc.paragraphs)

	    def process_xlsx(self, file_path: str) -> str:
	        """Read a spreadsheet with pandas and return it rendered as text.

	        No sheet is named in the call, so pandas' default sheet selection
	        applies.
	        """
	        df = pd.read_excel(file_path)
	        return df.to_string()

	    def process_csv(self, file_path: str) -> str:
	        """Read a CSV file with pandas and return it rendered as text."""
	        df = pd.read_csv(file_path)
	        return df.to_string()

	    def process_txt(self, file_path: str) -> str:
	        """Return the content of a plain-text file.

	        The file is decoded as UTF-8 regardless of the platform default;
	        bytes that are not valid UTF-8 become the replacement character
	        instead of raising.
	        """
	        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
	            return file.read()

	    def process_file(self, file_path: str) -> "str | list | Image.Image":
	        """Main method to process any supported file type.

	        The extension is matched case-insensitively.

	        Returns:
	            Whatever the matching ``process_*`` method returns.

	        Raises:
	            ValueError: If the extension has no registered processor.
	        """
	        _, file_extension = os.path.splitext(file_path)

	        processors = {
	            '.pdf': self.process_pdf,
	            '.jpeg': self.process_image,
	            '.jpg': self.process_image,
	            '.png': self.process_image,
	            '.doc': self.process_doc,
	            '.docx': self.process_docx,
	            '.xls': self.process_xlsx,
	            '.xlsx': self.process_xlsx,
	            '.csv': self.process_csv,
	            '.txt': self.process_txt,
	        }

	        processor = processors.get(file_extension.lower())
	        if not processor:
	            raise ValueError(f"Unsupported file format: {file_extension}")

	        return processor(file_path)