Spaces:

ibraheem007
/

tailored

Running

App Files Files Community

ibraheem007 committed 25 days ago

Commit

a0f26db

verified ·

1 Parent(s): 7ab96e5

Update components/file_processor.py

Browse files

Files changed (1) hide show

components/file_processor.py +84 -64

components/file_processor.py CHANGED Viewed

@@ -8,86 +8,98 @@ import logging
 logger = logging.getLogger(__name__)
 def process_uploaded_file(uploaded_file):
-    """Process uploaded file with Hugging Face Spaces compatible temp files"""
     logger.info(f"🔄 Starting file processing: {uploaded_file.name}")
     try:
         file_extension = uploaded_file.name.lower()
-        logger.info(f"📁 File type: {file_extension}")
-        # Use Streamlit's file handling instead of manual temp files
         file_content = uploaded_file.getvalue()
-        logger.info(f"📊 File size: {len(file_content)} bytes")
         if file_extension.endswith('.pdf'):
-            logger.info("📄 Processing PDF file...")
-            # For PDFs, we still need to use temp files for PyMuPDF
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
-                tmp.write(file_content)
-                tmp_path = tmp.name
             try:
-                full_text = extract_text_from_pdf(tmp_path)
-                logger.info(f"✅ PDF processed successfully, extracted {len(full_text)} characters")
             except Exception as pdf_error:
-                logger.error(f"❌ PDF extraction failed: {pdf_error}")
-                return None, f"Failed to extract text from PDF: {str(pdf_error)}"
-            finally:
-                # Always clean up temp file
-                if os.path.exists(tmp_path):
-                    os.unlink(tmp_path)
-                    logger.info("🧹 Temporary PDF file cleaned up")
-        elif file_extension.endswith('.pptx'):
-            logger.info("📊 Processing PPTX file...")
-            try:
-                # For PPTX, use in-memory processing
-                from io import BytesIO
-                from pptx import Presentation
-                pptx_file = BytesIO(file_content)
-                prs = Presentation(pptx_file)
-                full_text = ""
-                for slide in prs.slides:
-                    for shape in slide.shapes:
-                        if hasattr(shape, "text"):
-                            full_text += shape.text + "\n"
-                full_text = full_text.strip()
-                logger.info(f"✅ PPTX processed successfully, extracted {len(full_text)} characters")
-            except Exception as pptx_error:
-                logger.error(f"❌ PPTX extraction failed: {pptx_error}")
-                return None, f"Failed to extract text from PowerPoint: {str(pptx_error)}"
         elif file_extension.endswith('.docx'):
-            logger.info("📝 Processing DOCX file...")
-            try:
-                # For DOCX, use in-memory processing
-                from io import BytesIO
-                from docx import Document
-                docx_file = BytesIO(file_content)
-                doc = Document(docx_file)
-                full_text = "\n".join([para.text for para in doc.paragraphs])
-                full_text = full_text.strip()
-                logger.info(f"✅ DOCX processed successfully, extracted {len(full_text)} characters")
-            except Exception as docx_error:
-                logger.error(f"❌ DOCX extraction failed: {docx_error}")
-                return None, f"Failed to extract text from Word document: {str(docx_error)}"
         else:
-            logger.error(f"❌ Unsupported file type: {file_extension}")
-            return None, "Unsupported file type. Please upload PDF, PPTX, or DOCX files."
         if not full_text.strip():
-            logger.warning("⚠️ No text content extracted from file")
-            return None, "No text could be extracted from the file. The file might be empty, contain only images, or be corrupted."
-        logger.info("✅ File processing completed successfully")
         return full_text, None
     except Exception as e:
-        logger.error(f"❌ File processing failed: {str(e)}", exc_info=True)
         return None, f"Error processing file: {str(e)}"
 def get_student_content_input():
     """Get content input from student (file upload or text)"""
     st.subheader("📚 Provide Your Learning Material")
@@ -108,12 +120,20 @@ def get_student_content_input():
         )
         if uploaded_file:
             with st.spinner("📖 Reading your document..."):
                 content_text, error = process_uploaded_file(uploaded_file)
-            if error:
-                st.error(f"❌ {error}")
-            else:
-                st.success("✅ Document processed successfully!")
-                filename = uploaded_file.name
     else:
         content_text = st.text_area(
             "Paste the content you want to simplify:",

 logger = logging.getLogger(__name__)
 def process_uploaded_file(uploaded_file):
+    """Extract plain text from an uploaded PDF, PPTX, or DOCX file.
+
+    The upload is parsed from its in-memory content; only the PDF branch
+    falls back to a temporary file, and only if in-memory parsing fails.
+
+    Args:
+        uploaded_file: Object exposing ``name`` and ``getvalue()``; the
+            latter returns the raw file content.
+
+    Returns:
+        ``(text, None)`` on success, or ``(None, message)`` when the type
+        is unsupported, no text was found, or extraction failed.
+    """
     logger.info(f"🔄 Starting file processing: {uploaded_file.name}")
+    # Parse from memory first; a temp file is only the PDF fallback
     try:
         file_extension = uploaded_file.name.lower()
         file_content = uploaded_file.getvalue()
+        logger.info(f"📁 Processing {file_extension} file, size: {len(file_content)} bytes")
+        # PPTX and DOCX are parsed from a BytesIO wrapper around the content
+        from io import BytesIO
         if file_extension.endswith('.pdf'):
+            logger.info("📄 Processing PDF with direct bytes...")
             try:
+                # Imported here so a missing PyMuPDF also triggers the fallback
+                import fitz
+                # Context manager closes the document even if a page fails
+                with fitz.open(stream=file_content, filetype="pdf") as doc:
+                    full_text = "".join(page.get_text() for page in doc)
+                logger.info(f"✅ PDF processed: {len(full_text)} chars")
             except Exception as pdf_error:
+                logger.error(f"❌ PDF bytes failed: {pdf_error}")
+                # Fall back to the temp-file path, then continue to the shared
+                # empty-text check below instead of returning directly
+                full_text, fallback_error = process_pdf_with_minimal_temp(uploaded_file)
+                if fallback_error:
+                    return None, fallback_error
+        elif file_extension.endswith('.pptx'):
+            logger.info("📊 Processing PPTX with BytesIO...")
+            from pptx import Presentation
+            prs = Presentation(BytesIO(file_content))
+            # Collect non-empty shape text, one shape per line
+            full_text = "\n".join(
+                shape.text
+                for slide in prs.slides
+                for shape in slide.shapes
+                if hasattr(shape, "text") and shape.text
+            )
+            full_text = full_text.strip()
+            logger.info(f"✅ PPTX processed: {len(full_text)} chars")
         elif file_extension.endswith('.docx'):
+            logger.info("📝 Processing DOCX with BytesIO...")
+            from docx import Document
+            doc = Document(BytesIO(file_content))
+            # Skip whitespace-only paragraphs
+            full_text = "\n".join(para.text for para in doc.paragraphs if para.text.strip())
+            full_text = full_text.strip()
+            logger.info(f"✅ DOCX processed: {len(full_text)} chars")
         else:
+            return None, "Unsupported file type. Please upload PDF, PPTX, or DOCX."
         if not full_text.strip():
+            return None, "No text could be extracted from the file."
         return full_text, None
     except Exception as e:
+        logger.error(f"❌ File processing failed: {str(e)}", exc_info=True)
         return None, f"Error processing file: {str(e)}"
+def process_pdf_with_minimal_temp(uploaded_file):
+    """Extract PDF text via a temporary file, as a last resort.
+
+    Writes the upload to a temporary ``.pdf`` file, passes that path to
+    ``extract_text_from_pdf``, and removes the file afterwards.
+
+    Args:
+        uploaded_file: Object whose ``getvalue()`` returns the PDF content.
+
+    Returns:
+        ``(text, None)`` on success, or ``(None, message)`` on failure.
+    """
+    tmp_path = None
+    try:
+        # delete=False: the file is reopened by path after this handle closes
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+            # Record the path before writing so a failed write is still cleaned up
+            tmp_path = tmp_file.name
+            tmp_file.write(uploaded_file.getvalue())
+        return extract_text_from_pdf(tmp_path), None
+    except Exception as e:
+        return None, f"PDF processing failed: {str(e)}"
+    finally:
+        # Single cleanup path for success and failure; stays best-effort
+        if tmp_path is not None:
+            try:
+                os.unlink(tmp_path)
+            except OSError as cleanup_error:
+                logger.warning(f"Could not remove temp file {tmp_path}: {cleanup_error}")
 def get_student_content_input():
     """Get content input from student (file upload or text)"""
     st.subheader("📚 Provide Your Learning Material")
         )
         if uploaded_file:
             with st.spinner("📖 Reading your document..."):
+                # Add debug info
+                st.write(f"📁 Testing file: {uploaded_file.name} ({len(uploaded_file.getvalue())} bytes)")
                 content_text, error = process_uploaded_file(uploaded_file)
+                if error:
+                    st.error(f"❌ {error}")
+                    # Show debug info
+                    with st.expander("🔧 Debug Info"):
+                        st.write(f"File type: {uploaded_file.type}")
+                        st.write(f"File size: {len(uploaded_file.getvalue())} bytes")
+                else:
+                    st.success("✅ Document processed successfully!")
+                    filename = uploaded_file.name
     else:
         content_text = st.text_area(
             "Paste the content you want to simplify:",