ibraheem007 committed on
Commit
a0f26db
·
verified ·
1 Parent(s): 7ab96e5

Update components/file_processor.py

Browse files
Files changed (1) hide show
  1. components/file_processor.py +84 -64
components/file_processor.py CHANGED
@@ -8,86 +8,98 @@ import logging
8
  logger = logging.getLogger(__name__)
9
 
10
  def process_uploaded_file(uploaded_file):
11
- """Process uploaded file with Hugging Face Spaces compatible temp files"""
12
  logger.info(f"πŸ”„ Starting file processing: {uploaded_file.name}")
13
 
 
14
  try:
15
  file_extension = uploaded_file.name.lower()
16
- logger.info(f"πŸ“ File type: {file_extension}")
17
-
18
- # Use Streamlit's file handling instead of manual temp files
19
  file_content = uploaded_file.getvalue()
20
- logger.info(f"πŸ“Š File size: {len(file_content)} bytes")
 
 
 
 
21
 
22
  if file_extension.endswith('.pdf'):
23
- logger.info("πŸ“„ Processing PDF file...")
24
- # For PDFs, we still need to use temp files for PyMuPDF
25
- with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
26
- tmp.write(file_content)
27
- tmp_path = tmp.name
28
-
29
  try:
30
- full_text = extract_text_from_pdf(tmp_path)
31
- logger.info(f"βœ… PDF processed successfully, extracted {len(full_text)} characters")
 
 
 
 
 
 
32
  except Exception as pdf_error:
33
- logger.error(f"❌ PDF extraction failed: {pdf_error}")
34
- return None, f"Failed to extract text from PDF: {str(pdf_error)}"
35
- finally:
36
- # Always clean up temp file
37
- if os.path.exists(tmp_path):
38
- os.unlink(tmp_path)
39
- logger.info("🧹 Temporary PDF file cleaned up")
40
-
41
- elif file_extension.endswith('.pptx'):
42
- logger.info("πŸ“Š Processing PPTX file...")
43
- try:
44
- # For PPTX, use in-memory processing
45
- from io import BytesIO
46
- from pptx import Presentation
47
 
48
- pptx_file = BytesIO(file_content)
49
- prs = Presentation(pptx_file)
50
- full_text = ""
51
- for slide in prs.slides:
52
- for shape in slide.shapes:
53
- if hasattr(shape, "text"):
54
- full_text += shape.text + "\n"
55
- full_text = full_text.strip()
56
- logger.info(f"βœ… PPTX processed successfully, extracted {len(full_text)} characters")
57
- except Exception as pptx_error:
58
- logger.error(f"❌ PPTX extraction failed: {pptx_error}")
59
- return None, f"Failed to extract text from PowerPoint: {str(pptx_error)}"
60
 
61
  elif file_extension.endswith('.docx'):
62
- logger.info("πŸ“ Processing DOCX file...")
63
- try:
64
- # For DOCX, use in-memory processing
65
- from io import BytesIO
66
- from docx import Document
67
-
68
- docx_file = BytesIO(file_content)
69
- doc = Document(docx_file)
70
- full_text = "\n".join([para.text for para in doc.paragraphs])
71
- full_text = full_text.strip()
72
- logger.info(f"βœ… DOCX processed successfully, extracted {len(full_text)} characters")
73
- except Exception as docx_error:
74
- logger.error(f"❌ DOCX extraction failed: {docx_error}")
75
- return None, f"Failed to extract text from Word document: {str(docx_error)}"
76
  else:
77
- logger.error(f"❌ Unsupported file type: {file_extension}")
78
- return None, "Unsupported file type. Please upload PDF, PPTX, or DOCX files."
79
 
80
  if not full_text.strip():
81
- logger.warning("⚠️ No text content extracted from file")
82
- return None, "No text could be extracted from the file. The file might be empty, contain only images, or be corrupted."
83
 
84
- logger.info("βœ… File processing completed successfully")
85
  return full_text, None
86
 
87
  except Exception as e:
88
- logger.error(f"❌ File processing failed: {str(e)}", exc_info=True)
89
  return None, f"Error processing file: {str(e)}"
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  def get_student_content_input():
92
  """Get content input from student (file upload or text)"""
93
  st.subheader("πŸ“š Provide Your Learning Material")
@@ -108,12 +120,20 @@ def get_student_content_input():
108
  )
109
  if uploaded_file:
110
  with st.spinner("πŸ“– Reading your document..."):
 
 
 
111
  content_text, error = process_uploaded_file(uploaded_file)
112
- if error:
113
- st.error(f"❌ {error}")
114
- else:
115
- st.success("βœ… Document processed successfully!")
116
- filename = uploaded_file.name
 
 
 
 
 
117
  else:
118
  content_text = st.text_area(
119
  "Paste the content you want to simplify:",
 
8
  logger = logging.getLogger(__name__)
9
 
def process_uploaded_file(uploaded_file):
    """Extract the text of an uploaded PDF, PPTX, or DOCX file.

    The upload is read fully into memory with ``getvalue()`` and parsed from
    those bytes, so the normal path needs no temporary file (the original
    author notes this is meant for Hugging Face Spaces). Only when in-memory
    PDF parsing fails is ``process_pdf_with_minimal_temp`` tried as a
    fallback.

    Args:
        uploaded_file: Object exposing ``name`` and ``getvalue()`` --
            presumably a Streamlit ``UploadedFile``; confirm against caller.

    Returns:
        A ``(text, error)`` tuple: ``(text, None)`` on success, or
        ``(None, message)`` when the type is unsupported, no text could be
        extracted, or parsing raised.
    """
    logger.info("🔄 Starting file processing: %s", uploaded_file.name)

    try:
        # The whole lower-cased name is kept; only its suffix is inspected.
        lowered_name = uploaded_file.name.lower()
        file_content = uploaded_file.getvalue()

        logger.info(
            "📁 Processing %s file, size: %s bytes",
            lowered_name,
            len(file_content),
        )

        from io import BytesIO

        if lowered_name.endswith('.pdf'):
            logger.info("📄 Processing PDF with direct bytes...")
            try:
                # Imported lazily so a missing PyMuPDF only affects PDFs.
                import fitz

                doc = fitz.open(stream=file_content, filetype="pdf")
                try:
                    full_text = "".join(page.get_text() for page in doc)
                finally:
                    # Release the document even if a page fails to parse.
                    doc.close()
                logger.info("✅ PDF processed: %s chars", len(full_text))
            except Exception as pdf_error:
                logger.error("❌ PDF bytes failed: %s", pdf_error)
                # Fall back to the temp-file helper, but do not return its
                # result directly: the text must still pass the empty-text
                # check below.
                full_text, fallback_error = process_pdf_with_minimal_temp(
                    uploaded_file
                )
                if fallback_error:
                    return None, fallback_error

        elif lowered_name.endswith('.pptx'):
            logger.info("📊 Processing PPTX with BytesIO...")
            from pptx import Presentation

            presentation = Presentation(BytesIO(file_content))
            # Only shapes that expose a non-empty ``text`` attribute are used.
            shape_texts = [
                shape.text
                for slide in presentation.slides
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text
            ]
            full_text = "\n".join(shape_texts).strip()
            logger.info("✅ PPTX processed: %s chars", len(full_text))

        elif lowered_name.endswith('.docx'):
            logger.info("📝 Processing DOCX with BytesIO...")
            from docx import Document

            document = Document(BytesIO(file_content))
            # Blank paragraphs are skipped so they add no empty lines.
            full_text = "\n".join(
                para.text for para in document.paragraphs if para.text.strip()
            ).strip()
            logger.info("✅ DOCX processed: %s chars", len(full_text))

        else:
            return None, "Unsupported file type. Please upload PDF, PPTX, or DOCX."

        # ``full_text`` may be None if the fallback helper produced nothing.
        if not (full_text and full_text.strip()):
            return None, "No text could be extracted from the file."

        return full_text, None

    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("❌ File processing failed: %s", e)
        return None, f"Error processing file: {str(e)}"
74
 
def process_pdf_with_minimal_temp(uploaded_file):
    """Extract PDF text through a temporary file, as a last-resort fallback.

    The upload's bytes are written to a named ``.pdf`` temporary file, whose
    path is handed to ``extract_text_from_pdf`` (defined elsewhere in this
    module). The temporary file is removed afterwards whether or not
    extraction succeeded.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the PDF
            bytes -- presumably a Streamlit ``UploadedFile``; confirm
            against caller.

    Returns:
        ``(text, None)`` on success, or ``(None, message)`` if writing the
        file or extracting the text raised.
    """
    tmp_path = None
    try:
        # delete=False keeps the file on disk after the handle is closed so
        # the extractor can reopen it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            # Record the path before writing so a failed write is still
            # cleaned up in the ``finally`` block.
            tmp_path = tmp_file.name
            tmp_file.write(uploaded_file.getvalue())

        return extract_text_from_pdf(tmp_path), None

    except Exception as e:
        return None, f"PDF processing failed: {str(e)}"

    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass
            except OSError as cleanup_error:
                # Cleanup is best-effort: report it without masking the
                # extraction result.
                logger.warning(
                    "Could not remove temporary PDF %s: %s",
                    tmp_path,
                    cleanup_error,
                )
102
+
103
  def get_student_content_input():
104
  """Get content input from student (file upload or text)"""
105
  st.subheader("πŸ“š Provide Your Learning Material")
 
120
  )
121
  if uploaded_file:
122
  with st.spinner("πŸ“– Reading your document..."):
123
+ # Add debug info
124
+ st.write(f"πŸ“ Testing file: {uploaded_file.name} ({len(uploaded_file.getvalue())} bytes)")
125
+
126
  content_text, error = process_uploaded_file(uploaded_file)
127
+
128
+ if error:
129
+ st.error(f"❌ {error}")
130
+ # Show debug info
131
+ with st.expander("πŸ”§ Debug Info"):
132
+ st.write(f"File type: {uploaded_file.type}")
133
+ st.write(f"File size: {len(uploaded_file.getvalue())} bytes")
134
+ else:
135
+ st.success("βœ… Document processed successfully!")
136
+ filename = uploaded_file.name
137
  else:
138
  content_text = st.text_area(
139
  "Paste the content you want to simplify:",