| import base64 |
| from io import BytesIO |
|
|
class RequirementsIngest:
    """Load a requirements document (plain text or PDF) from a file-like object."""

    # Extracted PDF text longer than this many characters is cut and suffixed
    # with "..." before being returned as ``text_content``.
    _PDF_TEXT_LIMIT = 1000

    def __init__(self):
        pass

    def ingest_requirements_document(self, file_obj) -> dict:
        """
        Ingest a requirements document from a file-like object.

        The document kind is chosen from the extension of ``file_obj.name``:
        ``.pdf`` (case-insensitive) is treated as a PDF, anything else as text.
        The stream is rewound with ``seek(0)`` and read in full exactly once.

        Args:
            file_obj: Seekable, readable file-like object. Its optional
                ``name`` attribute supplies the filename; when it is missing
                or not a string, ``'unknown'`` is used.

        Returns:
            dict: {
                'type': 'text' or 'pdf',
                'content': str (for text) or base64 string (for PDF),
                'filename': str,
                'text_content': str (for PDFs the extracted text, truncated
                    to 1000 characters plus "...", or a placeholder message
                    when extraction is not possible; same as content for TXT),
                'file_size': int (raw byte count for PDFs, UTF-8 encoded
                    length of the text for TXT)
            }

        Raises:
            ValueError: If the document cannot be read; the original
                exception is attached as ``__cause__``.
        """
        try:
            filename = getattr(file_obj, 'name', 'unknown')
            # Some file-likes expose a name that is not a string (None, an
            # integer file descriptor); treat those like a missing name
            # instead of failing on the string operations below.
            if not isinstance(filename, str):
                filename = 'unknown'
            _, dot, suffix = filename.lower().rpartition('.')
            file_extension = suffix if dot else ''

            file_obj.seek(0)
            payload = file_obj.read()

            if file_extension == 'pdf':
                return {
                    'type': 'pdf',
                    'content': base64.b64encode(payload).decode('utf-8'),
                    'filename': filename,
                    'text_content': self._extract_pdf_text(payload, filename),
                    'file_size': len(payload),
                }

            if isinstance(payload, bytes):
                # Undecodable bytes become U+FFFD rather than aborting ingest.
                payload = payload.decode("utf-8", errors="replace")

            return {
                'type': 'text',
                'content': payload,
                'filename': filename,
                'text_content': payload,
                'file_size': len(payload.encode('utf-8')),
            }

        except Exception as e:
            raise ValueError(f"Error reading requirements document: {e}") from e

    @staticmethod
    def _extract_pdf_text(pdf_bytes, filename) -> str:
        """Return a text preview of a PDF, or a placeholder message on failure."""
        limit = RequirementsIngest._PDF_TEXT_LIMIT
        try:
            # PyPDF2 is optional: imported lazily so text ingestion works
            # without it and PDFs degrade to a placeholder message.
            from PyPDF2 import PdfReader

            reader = PdfReader(BytesIO(pdf_bytes))
            # extract_text() may yield None for a page without a text layer;
            # treat that as an empty page so the other pages are still kept.
            extracted = "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except ImportError:
            return f"PDF Requirements Document: {filename} (PyPDF2 not available for text extraction)"
        except Exception as e:
            # Deliberate best-effort: a broken PDF must not fail the ingest,
            # the base64 payload is still returned to the caller.
            return f"PDF Requirements Document: {filename} (text extraction failed: {str(e)})"

        if not extracted.strip():
            return f"PDF Requirements Document: {filename} (no text content found)"
        if len(extracted) > limit:
            return extracted[:limit] + "..."
        return extracted
|
|
|
|
|
|