# Spaces: yasirme/RAG-retrieval (status: Sleeping) - File size: 3,508 Bytes

import io
import pdfplumber
import pandas as pd
import json
from docx import Document
from openpyxl import load_workbook
import re
import uuid 


class FileReader:
    """Extract cleaned plain text from uploaded files and enforce size budgets.

    Supported extensions are listed in ``allowed_files``; each one is mapped
    to a private ``_read_*`` method that returns text passed through
    ``_clean_text``.
    """

    # Compiled once; used by _clean_text for every file.
    _WHITESPACE_RE = re.compile(r'\s+')
    _NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')

    def __init__(self):
        # Extensions (lower-case, without the dot) accepted by calc_chars.
        self.allowed_files = ["txt", "pdf", "docx", "md", "json", "csv", "xlsx", "xls"]
        # Upper bound on the cleaned text length of any single file.
        self.max_chars_per_file = 5000000

    def calc_chars(self, files, allowed_chars):
        """Read every file, clean its text and total the character counts.

        Args:
            files: Iterable of upload objects exposing a ``filename``
                attribute and a binary file-like interface.
            allowed_chars: Maximum total characters across all files; any
                value accepted by ``int()``.

        Returns:
            A ``(payload, status)`` tuple. On success the payload holds
            ``total_chars`` and ``clean_contents`` (one dict per file with
            ``type``, ``content``, ``name``, ``id`` and ``total_chars``) and
            the status is 200. On failure the payload is ``{"error": ...}``
            with status 400 (unsupported type or a limit exceeded) or 500
            (any exception raised while reading a file).
        """
        # Dispatch table instead of an if/elif chain: an extension without a
        # reader can never fall through and reuse a previous file's text.
        readers = {
            "txt": self._read_txt,
            "md": self._read_txt,
            "pdf": self._read_pdf,
            "docx": self._read_docx,
            "json": self._read_json,
            "csv": self._read_csv,
            "xlsx": self._read_excel,
            "xls": self._read_excel,
        }

        total_chars = 0
        clean_contents = []
        for file in files:
            # A missing filename, or a name without a dot, has no extension.
            # Previously a file literally named "pdf" was treated as a PDF and
            # a None filename raised AttributeError outside the try block.
            filename = file.filename or ""
            _, dot, suffix = filename.rpartition(".")
            file_extension = suffix.lower() if dot else ""

            reader = readers.get(file_extension)
            if file_extension not in self.allowed_files or reader is None:
                return {"error": "unsupported file type uploaded"}, 400

            try:
                text = reader(file)

                if len(text) > self.max_chars_per_file:
                    return {"error": "max 5 million characters per file allowed."}, 400

                clean_contents.append({
                    "type": file_extension,
                    "content": text,
                    "name": file.filename,
                    "id": str(uuid.uuid4()),
                    "total_chars": len(text),
                })
                total_chars += len(text)
                if total_chars > int(allowed_chars):
                    return {"error": "Total allowed characters limit reached"}, 400

            except Exception as e:
                # Boundary handler: any parsing failure is reported to the
                # caller as an error payload rather than propagated.
                return {"error": f"Error reading file {file.filename}: {e}"}, 500

        return {"total_chars": total_chars, "clean_contents": clean_contents}, 200

    def _read_txt(self, file):
        """Decode the file's bytes as UTF-8 and return the cleaned text."""
        file_content = file.read().decode("utf-8")
        return self._clean_text(file_content)

    def _read_pdf(self, file):
        """Return the cleaned text of every page of a PDF."""
        with pdfplumber.open(file) as pdf:
            # extract_text() may return None for a page; treat it as empty.
            pages = [page.extract_text() or '' for page in pdf.pages]
        # Join with a newline so the last word of one page does not fuse with
        # the first word of the next (the newline becomes a single space).
        return self._clean_text("\n".join(pages))

    def _read_docx(self, file):
        """Return the cleaned text of all paragraphs in a .docx document."""
        doc = Document(file)
        return self._clean_text("\n".join(para.text for para in doc.paragraphs))

    def _read_json(self, file):
        """Parse a JSON file and return its re-serialised, cleaned text."""
        content = json.load(file)
        text = json.dumps(content, ensure_ascii=False)
        return self._clean_text(text)

    def _read_csv(self, file):
        """Load a CSV with pandas and return its cleaned string rendering."""
        df = pd.read_csv(file)
        text = df.to_string(index=False)
        return self._clean_text(text)

    def _read_excel(self, file):
        """Return the cleaned text of every row of every worksheet.

        Cells in a row are joined with ``' | '``; ``None`` cells become
        empty strings.
        """
        # NOTE(review): openpyxl is documented for xlsx/xlsm/xltx/xltm; legacy
        # ".xls" uploads will presumably fail here and surface as a 500 from
        # calc_chars - confirm and either drop "xls" or add a dedicated reader.
        wb = load_workbook(file)
        rows = (
            ' | '.join(str(cell) if cell is not None else '' for cell in row)
            for sheet in wb.sheetnames
            for row in wb[sheet].iter_rows(values_only=True)
        )
        return self._clean_text("\n".join(rows))

    def _clean_text(self, text):
        """Collapse whitespace runs to one space, drop non-ASCII, and strip."""
        text = self._WHITESPACE_RE.sub(' ', text)
        # NOTE(review): this removes every non-ASCII character, so accented
        # and non-Latin text is discarded - confirm this is intended.
        text = self._NON_ASCII_RE.sub('', text)
        return text.strip()

# Module-level FileReader instance, created once when this module is imported.
file_reader = FileReader()