invoice_processing / src /file_processor.py
krishnavadithya's picture
Upload 6 files
a36a2f6 verified
import os
from typing import Optional
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image
from .config import settings
from striprtf.striprtf import rtf_to_text
from docx import Document
class FileProcessor:
    """Convert supported input files into text or images.

    ``process_file`` is the main entry point: it picks a ``process_*``
    method from the file extension. Text-like formats are returned as
    strings, PDFs as a list of page images and image files as a single
    RGB image.
    """

    # A ``.doc`` file whose extracted text has more lines than this is
    # returned as a list of chunks instead of one string.
    DOC_SPLIT_THRESHOLD = 200
    # Divisor used to derive the chunk size for long ``.doc`` files.
    DOC_SPLIT_PARTS = 3

    @staticmethod
    def _split_into_chunks(data: str, parts: int = 3) -> list:
        """Split *data* into newline-joined chunks of ``line_count // parts`` lines.

        When the line count is not an exact multiple of *parts*, the
        leftover lines form one additional, shorter chunk at the end, so the
        result holds *parts* or *parts* + 1 chunks.
        """
        lines = data.split('\n')
        # Guard against a zero step when there are fewer lines than parts.
        chunk_size = max(1, len(lines) // parts)
        return [
            '\n'.join(lines[start:start + chunk_size])
            for start in range(0, len(lines), chunk_size)
        ]

    def process_pdf(self, file_path: str) -> "list[Image.Image]":
        """Render the PDF at *file_path* and return the resulting list of images."""
        return convert_from_path(file_path)

    def process_image(self, file_path: str) -> "Image.Image":
        """Open the image at *file_path* and return an RGB copy of it."""
        # ``convert`` returns a new, fully loaded image, so the handle opened
        # by ``Image.open`` can be closed as soon as the conversion is done.
        with Image.open(file_path) as image:
            return image.convert('RGB')

    def process_doc(self, file_path: str) -> "str | list[str]":
        """Extract the text of a ``.doc`` file.

        A path ending in ``.doc`` is read as text and passed through
        ``rtf_to_text``; any other path is handled like a ``.docx`` file.

        Returns:
            The extracted text as one string, or a list of chunk strings when
            the text has more than ``DOC_SPLIT_THRESHOLD`` lines.
        """
        if not file_path.lower().endswith('.doc'):
            return self.process_docx(file_path)

        # The file is treated as RTF text; undecodable bytes are dropped.
        with open(file_path, "r", encoding='utf-8', errors='ignore') as file:
            data = rtf_to_text(file.read())

        if len(data.split('\n')) > self.DOC_SPLIT_THRESHOLD:
            return self._split_into_chunks(data, self.DOC_SPLIT_PARTS)
        return data

    def process_docx(self, file_path: str) -> str:
        """Return the paragraph texts of a ``.docx`` file joined by newlines."""
        doc = Document(file_path)
        return '\n'.join(paragraph.text for paragraph in doc.paragraphs)

    def process_xlsx(self, file_path: str) -> str:
        """Read a spreadsheet with ``pandas.read_excel`` and return its string form."""
        return pd.read_excel(file_path).to_string()

    def process_csv(self, file_path: str) -> str:
        """Read a CSV file with ``pandas.read_csv`` and return its string form."""
        return pd.read_csv(file_path).to_string()

    def process_txt(self, file_path: str) -> str:
        """Return the full content of a plain-text file.

        The file is decoded as UTF-8 regardless of the platform default
        encoding; undecodable bytes become U+FFFD instead of raising.
        """
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()

    def process_file(self, file_path: str) -> "str | list | Image.Image":
        """Main method to process any supported file type.

        Args:
            file_path: Path of the file; its extension (case-insensitive)
                selects the processor.

        Returns:
            Whatever the selected ``process_*`` method returns.

        Raises:
            ValueError: If the extension has no registered processor.
        """
        _, file_extension = os.path.splitext(file_path)
        processors = {
            '.pdf': self.process_pdf,
            '.jpeg': self.process_image,
            '.jpg': self.process_image,
            '.png': self.process_image,
            '.doc': self.process_doc,
            '.docx': self.process_docx,
            '.xls': self.process_xlsx,
            '.xlsx': self.process_xlsx,
            '.csv': self.process_csv,
            '.txt': self.process_txt,
        }
        processor = processors.get(file_extension.lower())
        if processor is None:
            raise ValueError(f"Unsupported file format: {file_extension}")
        return processor(file_path)