"""Gradio app that downloads a file from a URL and extracts its text."""

import io
import mimetypes
import os
import re
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from typing import List, Tuple

import filetype
import gradio as gr
import PyPDF2
import requests
from openpyxl import load_workbook
from pptx import Presentation  # noqa: F401  (imported by the original file)

# Constants
CHUNK_SIZE = 32000
# Size of each streamed download chunk, in bytes.
DOWNLOAD_CHUNK_SIZE = 8192000
# Seconds to wait for the remote server before giving up.
REQUEST_TIMEOUT = 30

_MIME_PDF = "application/pdf"
_MIME_XLSX = (
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
_MIME_DOCX = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
_MIME_PPTX = (
    "application/vnd.openxmlformats-officedocument.presentationml.presentation"
)

# Archive member names that hold text inside DOCX / PPTX packages.
_DOCX_HEADER_RE = re.compile(r'word/header[0-9]*\.xml')
_DOCX_FOOTER_RE = re.compile(r'word/footer[0-9]*\.xml')
_PPTX_NOTES_RE = re.compile(r'ppt/notesSlides/notesSlide[0-9]*\.xml')
_PPTX_SLIDE_RE = re.compile(r'ppt/slides/slide[0-9]*\.xml')


# --- Utility Functions ---

def xml2text(xml) -> str:
    """Extracts text from XML data.

    Concatenates the ``.text`` of every element in document order, each
    followed by a single space. Elements without text are skipped.
    """
    root = ET.fromstring(xml)
    return ''.join(
        node.text + ' ' for node in root.iter() if node.text is not None
    )


def clean_text(content: str) -> str:
    """Collapses every run of whitespace in *content* to a single space."""
    # \s already covers newlines, carriage returns and tabs.
    return re.sub(r'\s+', ' ', content)


def split_content(content, chunk_size=CHUNK_SIZE) -> List[str]:
    """Splits content into consecutive chunks of at most *chunk_size*."""
    return [
        content[start:start + chunk_size]
        for start in range(0, len(content), chunk_size)
    ]


def _xml_members_text(zipf, names, pattern) -> str:
    """Returns the text of every archive member whose name matches."""
    return ''.join(
        xml2text(zipf.read(name)) for name in names if pattern.fullmatch(name)
    )


def _decode_text(file_content: bytes, clean: bool) -> Tuple[str, int]:
    """Decodes raw bytes as UTF-8 text and returns (content, length)."""
    content = file_content.decode('utf-8')
    if clean:
        content = clean_text(content)
    # NOTE(review): length is measured on repr(content), as in the original
    # code; the DOCX/PPTX extractors report len(text) instead.
    return content, len(repr(content))


# --- Document Reading Functions ---

def extract_text_from_docx(docx_data, clean=True) -> Tuple[str, int]:
    """Extracts text from DOCX files.

    Reads headers, then the main document, then footers from the DOCX zip
    package given as raw bytes. Returns ``(text, len(text))``.
    """
    # The context manager closes the archive even when parsing fails.
    with zipfile.ZipFile(io.BytesIO(docx_data)) as zipf:
        names = zipf.namelist()
        text = ''.join((
            _xml_members_text(zipf, names, _DOCX_HEADER_RE),
            xml2text(zipf.read('word/document.xml')),
            _xml_members_text(zipf, names, _DOCX_FOOTER_RE),
        ))

    if clean:
        text = clean_text(text)
    return text, len(text)


def extract_text_from_pptx(pptx_data, clean=True) -> Tuple[str, int]:
    """Extracts text from PPTX files.

    Reads slide notes first, then slide content, from the PPTX zip package
    given as raw bytes. Returns ``(text, len(text))``.
    """
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as zipf:
        names = zipf.namelist()
        text = ''.join((
            # Text from slide notes
            _xml_members_text(zipf, names, _PPTX_NOTES_RE),
            # Text from slide content (shapes and text boxes)
            _xml_members_text(zipf, names, _PPTX_SLIDE_RE),
        ))

    if clean:
        text = clean_text(text)
    return text, len(text)


def read_document(file_path, clean=True) -> Tuple[str, int]:
    """Reads a local file and returns ``(text, length)``.

    The file type is detected from the file's bytes with ``filetype``.
    PDF, XLSX, DOCX and PPTX get dedicated extraction; anything else is
    decoded as UTF-8 text. A failure while reading is reported as
    ``(error message, 0)`` rather than raised.
    """
    with open(file_path, "rb") as f:
        file_content = f.read()

    kind = filetype.guess(file_content)
    mime = "text" if kind is None else kind.mime

    if mime == _MIME_PDF:
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            # Guard against a page yielding no text at all.
            content = ''.join(
                page.extract_text() or '' for page in pdf_reader.pages
            )
            if clean:
                content = clean_text(content)
            return content, len(repr(content))
        except Exception as e:
            return f"Error reading PDF: {e}", 0

    if mime == _MIME_XLSX:
        try:
            wb = load_workbook(io.BytesIO(file_content))
            content = ''.join(
                str(cell.value) + ' '
                for sheet in wb.worksheets
                for row in sheet.rows
                for cell in row
                if cell.value is not None
            )
            if clean:
                content = clean_text(content)
            return content, len(repr(content))
        except Exception as e:
            return f"Error reading XLSX: {e}", 0

    if mime == _MIME_DOCX:
        try:
            return extract_text_from_docx(file_content, clean)
        except Exception as e:
            return f"Error reading DOCX: {e}", 0

    if mime == _MIME_PPTX:
        try:
            return extract_text_from_pptx(file_content, clean)
        except Exception as e:
            return f"Error reading PPTX: {e}", 0

    # Plain text, CSV and every unrecognised type are decoded the same way;
    # only the wording of the error message differs.
    label = {"text/plain": "TXT file", "text/csv": "CSV file"}.get(
        mime, "file"
    )
    try:
        return _decode_text(file_content, clean)
    except Exception as e:
        return f"Error reading {label}: {e}", 0


def download_and_process_file(url, clean=True) -> Tuple[str, int]:
    """Downloads a file from a URL and returns its extracted content.

    Returns ``(content, length)``. For an image the content is a markdown
    image link to *url* and the length is 0; for any other file it is the
    result of :func:`read_document`. Download failures are returned as
    ``(error message, 0)``.

    NOTE(review): the URL is fetched as given, with no host restriction.
    """
    url = url.strip()
    if not url.startswith(("http://", "https://")):
        url = "http://" + url  # Prepend "http://" if not present

    temp_path = None
    try:
        # The timeout keeps a silent server from blocking the request
        # forever; the context manager releases the streamed connection.
        with requests.get(
            url, stream=True, timeout=REQUEST_TIMEOUT
        ) as response:
            response.raise_for_status()  # Raise for bad status codes

            # Infer a file extension from the content type. The header may
            # be missing, and may carry parameters such as "; charset=...".
            content_type = response.headers.get('content-type', '')
            mime_type = content_type.split(';', 1)[0].strip()
            suffix = mimetypes.guess_extension(mime_type) or ''

            # A unique temporary file avoids name clashes between
            # concurrent requests and empty names derived from the URL.
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=suffix
            ) as tmp:
                temp_path = tmp.name
                for chunk in response.iter_content(
                    chunk_size=DOWNLOAD_CHUNK_SIZE
                ):
                    tmp.write(chunk)

        # Check if it's an image type
        kind = filetype.guess(temp_path)
        if kind and kind.mime.startswith('image/'):
            return f"![]({url})", 0  # Markdown image syntax for images
        return read_document(temp_path, clean)  # Otherwise, a document

    except requests.exceptions.MissingSchema:
        return (
            "Error: Invalid URL format. Even after adding 'http://', "
            "the URL is still invalid.",
            0,
        )
    except requests.exceptions.ConnectionError:
        return (
            "Error: Could not connect to the server. "
            "Please check your internet connection.",
            0,
        )
    except requests.exceptions.RequestException as e:
        return f"Error downloading file: {e}", 0
    finally:
        # Best-effort cleanup of the downloaded copy.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass


# --- Gradio Interface ---

iface = gr.Interface(
    fn=download_and_process_file,
    inputs=[
        gr.Textbox(lines=1, placeholder="Enter URL of the file"),
        gr.Checkbox(label="Clean Text", value=True),
    ],
    outputs=[
        gr.Textbox(label="Document Content/Image Markdown"),
        gr.Number(label="Document Length (characters)"),
    ],
    title="Enhanced File Processor for Hugging Face Chat Tools",
    description=(
        "Enter the URL of site and extract its content. "
        "This tool is designed for use with Hugging Face Chat Tools: "
        "[https://hf.co/chat/tools/66f1a8159d41ad4398ebb711]"
        "(https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)"
    ),
    concurrency_limit=None,
)

iface.launch()