mtyrrell committed
Commit 537051a · 1 Parent(s): 4cd2e8f

basic chunking

Files changed (6)
  1. .DS_Store +0 -0
  2. Dockerfile +39 -0
  3. app/main.py +546 -0
  4. app/utils.py +0 -0
  5. params.cfg +0 -0
  6. requirements.txt +14 -0
.DS_Store ADDED
Binary file (6.15 kB)
 
Dockerfile ADDED
@@ -0,0 +1,39 @@
+ # -------- base image --------
+ FROM python:3.10-slim
+
+ ENV PYTHONUNBUFFERED=1 \
+     OMP_NUM_THREADS=1 \
+     TOKENIZERS_PARALLELISM=false
+
+
+ # ---------- Create Non-Root User ----------
+ # Ensures proper file permissions for dev and runtime
+ RUN useradd -m -u 1000 user
+
+
+ # -------- working directory --------
+ WORKDIR /app
+
+ # ---------- Install Python Dependencies ----------
+ # Copy requirements first to leverage Docker layer caching
+ COPY --chown=user requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Install system dependencies for document processing
+ # RUN apt-get update && apt-get install -y \
+ #     build-essential \
+ #     && rm -rf /var/lib/apt/lists/*
+
+ # ---------- Copy Project Files ----------
+ # Set appropriate ownership and permissions
+ COPY --link --chown=1000 . .
+
+
+ # Create directories for document storage
+ RUN mkdir -p uploaded_docs processed_docs
+
+ # Expose the Gradio UI (7860) and FastAPI (7863) ports
+ EXPOSE 7860 7863
+
+ # Launch with unbuffered output
+ CMD ["python", "-m", "app.main"]
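
Note on the two exposed ports: 7860 serves the Gradio UI and 7863 serves the FastAPI service, both started by app/main.py below. A minimal host-side check might look like the following sketch; it assumes the container is run with both ports published to localhost (the port mapping and hostname are assumptions, not part of this commit).

import json
import urllib.request

# Assumes the container was started with ports 7860 and 7863 published (not part of this commit).
with urllib.request.urlopen("http://localhost:7863/health", timeout=5) as resp:
    print(json.loads(resp.read()))  # expected shape: {"status": "healthy", "documents_processed": 0}

# The Gradio UI on port 7860 is meant for a browser, not this script.
print("Gradio UI: http://localhost:7860")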
app/main.py ADDED
@@ -0,0 +1,546 @@
+ import gradio as gr
+ from fastapi import FastAPI, UploadFile, File, HTTPException
+ from pydantic import BaseModel
+ from typing import Optional, Dict, Any, List
+ import uvicorn
+ import os
+ import hashlib
+ import logging
+ from datetime import datetime
+ from contextlib import asynccontextmanager
+ import json
+ import re
+ from pathlib import Path
+
+ # Document processing imports
+ import PyPDF2
+ from docx import Document as DocxDocument
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Create directories for document storage
+ UPLOAD_DIR = Path("uploaded_docs")
+ PROCESSED_DIR = Path("processed_docs")
+ UPLOAD_DIR.mkdir(exist_ok=True)
+ PROCESSED_DIR.mkdir(exist_ok=True)
+
+ # Models
+ class IngestRequest(BaseModel):
+     doc_id: str
+     file_content: bytes
+     filename: str
+     content_type: str
+
+ class IngestResponse(BaseModel):
+     doc_id: str
+     chunks_indexed: int
+     status: str
+     metadata: Dict[str, Any]
+
+ class DocumentChunk(BaseModel):
+     doc_id: str
+     chunk_id: str
+     content: str
+     metadata: Dict[str, Any]
+
+ # Global storage for processed documents (in production, use proper vector store)
+ DOCUMENT_STORE: Dict[str, List[DocumentChunk]] = {}
+ DOCUMENT_METADATA: Dict[str, Dict[str, Any]] = {}
+
+ def extract_text_from_pdf(file_path: str) -> tuple[str, Dict[str, Any]]:
+     """Extract text from PDF file"""
+     try:
+         with open(file_path, 'rb') as file:
+             pdf_reader = PyPDF2.PdfReader(file)
+             text = ""
+             metadata = {
+                 "total_pages": len(pdf_reader.pages),
+                 "page_texts": []
+             }
+
+             for page_num, page in enumerate(pdf_reader.pages):
+                 page_text = page.extract_text()
+                 text += f"\n--- Page {page_num + 1} ---\n{page_text}"
+                 metadata["page_texts"].append({
+                     "page": page_num + 1,
+                     "text": page_text,
+                     "char_count": len(page_text)
+                 })
+
+             return text, metadata
+     except Exception as e:
+         logger.error(f"PDF extraction error: {str(e)}")
+         raise Exception(f"Failed to extract text from PDF: {str(e)}")
+
+ def extract_text_from_docx(file_path: str) -> tuple[str, Dict[str, Any]]:
+     """Extract text from DOCX file"""
+     try:
+         doc = DocxDocument(file_path)
+         text = ""
+         metadata = {
+             "total_paragraphs": 0,
+             "paragraph_texts": []
+         }
+
+         for i, paragraph in enumerate(doc.paragraphs):
+             if paragraph.text.strip():
+                 text += f"{paragraph.text}\n"
+                 metadata["paragraph_texts"].append({
+                     "paragraph": i + 1,
+                     "text": paragraph.text,
+                     "char_count": len(paragraph.text)
+                 })
+                 metadata["total_paragraphs"] += 1
+
+         return text, metadata
+     except Exception as e:
+         logger.error(f"DOCX extraction error: {str(e)}")
+         raise Exception(f"Failed to extract text from DOCX: {str(e)}")
+
+ def simple_text_splitter(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
+     """Simple text splitter without external dependencies"""
+     if not text:
+         return []
+
+     # Split by common separators in order of preference
+     separators = ["\n\n", "\n", ". ", "! ", "? ", " "]
+
+     def split_text_recursive(text: str, separators: List[str]) -> List[str]:
+         if not separators:
+             # If no separators left, split by character count
+             chunks = []
+             for i in range(0, len(text), chunk_size - chunk_overlap):
+                 chunk = text[i:i + chunk_size]
+                 if chunk.strip():
+                     chunks.append(chunk.strip())
+             return chunks
+
+         separator = separators[0]
+         remaining_separators = separators[1:]
+
+         splits = text.split(separator)
+         chunks = []
+         current_chunk = ""
+
+         for split in splits:
+             # If adding this split would exceed chunk_size
+             if len(current_chunk) + len(split) + len(separator) > chunk_size:
+                 if current_chunk:
+                     # If current chunk is still too big, recursively split it
+                     if len(current_chunk) > chunk_size:
+                         sub_chunks = split_text_recursive(current_chunk, remaining_separators)
+                         chunks.extend(sub_chunks)
+                     else:
+                         chunks.append(current_chunk.strip())
+                 current_chunk = split
+             else:
+                 if current_chunk:
+                     current_chunk += separator + split
+                 else:
+                     current_chunk = split
+
+         # Add the last chunk
+         if current_chunk:
+             if len(current_chunk) > chunk_size:
+                 sub_chunks = split_text_recursive(current_chunk, remaining_separators)
+                 chunks.extend(sub_chunks)
+             else:
+                 chunks.append(current_chunk.strip())
+
+         return chunks
+
+     # Split the text
+     initial_chunks = split_text_recursive(text, separators)
+
+     # Add overlap between chunks
+     final_chunks = []
+     for i, chunk in enumerate(initial_chunks):
+         if i > 0 and chunk_overlap > 0:
+             # Add overlap from previous chunk
+             prev_chunk = initial_chunks[i-1]
+             overlap = prev_chunk[-chunk_overlap:] if len(prev_chunk) > chunk_overlap else prev_chunk
+             chunk = overlap + " " + chunk
+         final_chunks.append(chunk)
+
+     return [chunk for chunk in final_chunks if chunk.strip()]
+
+ def clean_and_chunk_text(text: str, doc_id: str) -> List[DocumentChunk]:
+     """Clean text and split into chunks"""
+     # Basic text cleaning
+     text = re.sub(r'\n+', '\n', text)  # Remove multiple newlines
+     text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
+     text = text.strip()
+
+     # Split text into chunks using simple splitter
+     chunks = simple_text_splitter(text, chunk_size=500, chunk_overlap=50)
+
+     # Create DocumentChunk objects
+     document_chunks = []
+     for i, chunk_text in enumerate(chunks):
+         chunk = DocumentChunk(
+             doc_id=doc_id,
+             chunk_id=f"{doc_id}_chunk_{i}",
+             content=chunk_text,
+             metadata={
+                 "chunk_index": i,
+                 "chunk_length": len(chunk_text),
+                 "created_at": datetime.now().isoformat()
+             }
+         )
+         document_chunks.append(chunk)
+
+     return document_chunks
+
+ def generate_doc_id(filename: str, content: bytes) -> str:
+     """Generate unique document ID"""
+     # Create hash from content for uniqueness
+     content_hash = hashlib.md5(content).hexdigest()[:8]
+     # Clean filename
+     clean_name = re.sub(r'[^a-zA-Z0-9._-]', '_', filename)
+     # Remove extension
+     name_without_ext = os.path.splitext(clean_name)[0]
+     # Create doc_id
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     return f"{timestamp}_{name_without_ext}_{content_hash}"
+
+ def process_document(file_content: bytes, filename: str) -> IngestResponse:
+     """Main document processing function"""
+     start_time = datetime.now()
+
+     try:
+         # Generate document ID
+         doc_id = generate_doc_id(filename, file_content)
+
+         # Save uploaded file temporarily
+         file_extension = os.path.splitext(filename)[1].lower()
+         temp_file_path = UPLOAD_DIR / f"{doc_id}{file_extension}"
+
+         with open(temp_file_path, 'wb') as f:
+             f.write(file_content)
+
+         # Extract text based on file type
+         if file_extension == '.pdf':
+             text, extraction_metadata = extract_text_from_pdf(str(temp_file_path))
+         elif file_extension == '.docx':
+             text, extraction_metadata = extract_text_from_docx(str(temp_file_path))
+         else:
+             raise ValueError(f"Unsupported file type: {file_extension}")
+
+         # Clean and chunk text
+         chunks = clean_and_chunk_text(text, doc_id)
+
+         # Store chunks (in production, this would go to vector store)
+         DOCUMENT_STORE[doc_id] = chunks
+
+         # Store metadata
+         processing_time = (datetime.now() - start_time).total_seconds()
+         DOCUMENT_METADATA[doc_id] = {
+             "filename": filename,
+             "doc_id": doc_id,
+             "file_type": file_extension,
+             "processing_time": processing_time,
+             "total_text_length": len(text),
+             "chunks_count": len(chunks),
+             "extraction_metadata": extraction_metadata,
+             "processed_at": datetime.now().isoformat(),
+             "status": "ready"
+         }
+
+         # Clean up temporary file
+         temp_file_path.unlink()
+
+         # Save processed document
+         processed_file_path = PROCESSED_DIR / f"{doc_id}.json"
+         with open(processed_file_path, 'w') as f:
+             json.dump({
+                 "metadata": DOCUMENT_METADATA[doc_id],
+                 "chunks": [chunk.dict() for chunk in chunks]
+             }, f, indent=2)
+
+         logger.info(f"Successfully processed document {doc_id}: {len(chunks)} chunks")
+
+         return IngestResponse(
+             doc_id=doc_id,
+             chunks_indexed=len(chunks),
+             status="ready",
+             metadata=DOCUMENT_METADATA[doc_id]
+         )
+
+     except Exception as e:
+         logger.error(f"Document processing failed: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
+
+ def get_document_context(doc_id: str, max_chunks: int = 10) -> str:
+     """Retrieve document context for a given doc_id"""
+     if doc_id not in DOCUMENT_STORE:
+         return f"Document {doc_id} not found."
+
+     chunks = DOCUMENT_STORE[doc_id][:max_chunks]
+     context_parts = []
+
+     for chunk in chunks:
+         context_parts.append(f"[Chunk {chunk.metadata['chunk_index']}]: {chunk.content}")
+
+     return "\n\n".join(context_parts)
+
+ # Gradio functions
+ def gradio_upload_and_process(file):
+     """Process uploaded file through Gradio"""
+     if file is None:
+         return "No file uploaded", "", ""
+
+     try:
+         with open(file.name, 'rb') as f:
+             file_content = f.read()
+
+         filename = os.path.basename(file.name)
+         result = process_document(file_content, filename)
+
+         # Format response for Gradio
+         response_text = f"""
+ ✅ Document processed successfully!
+
+ 📄 Document ID: {result.doc_id}
+ 📊 Chunks created: {result.chunks_indexed}
+ ⏱️ Processing time: {result.metadata['processing_time']:.2f}s
+ 📝 Total text length: {result.metadata['total_text_length']} characters
+ 📑 File type: {result.metadata['file_type']}
+
+ Status: {result.status}
+         """
+
+         # Get the processed chunks for display
+         chunks = DOCUMENT_STORE.get(result.doc_id, [])
+         chunks_display = ""
+         if chunks:
+             chunks_display = "📄 Processed Chunks:\n\n"
+             for i, chunk in enumerate(chunks[:10]):  # Show first 10 chunks
+                 chunks_display += f"--- Chunk {i+1} ---\n"
+                 chunks_display += f"Length: {len(chunk.content)} characters\n"
+                 chunks_display += f"Content: {chunk.content[:200]}{'...' if len(chunk.content) > 200 else ''}\n\n"
+
+             if len(chunks) > 10:
+                 chunks_display += f"... and {len(chunks) - 10} more chunks\n"
+
+         return response_text, result.doc_id, chunks_display
+
+     except Exception as e:
+         error_msg = f"❌ Error processing document: {str(e)}"
+         logger.error(error_msg)
+         return error_msg, "", ""
+
+ def gradio_get_context(doc_id: str, max_chunks: int = 5):
+     """Get document context through Gradio"""
+     if not doc_id.strip():
+         return "Please enter a document ID"
+
+     try:
+         context = get_document_context(doc_id.strip(), max_chunks)
+         return f"📄 Context for document '{doc_id}':\n\n{context}"
+     except Exception as e:
+         return f"❌ Error retrieving context: {str(e)}"
+
+ def list_documents():
+     """List all processed documents"""
+     if not DOCUMENT_METADATA:
+         return "No documents processed yet."
+
+     doc_list = []
+     for doc_id, metadata in DOCUMENT_METADATA.items():
+         doc_list.append(f"• {doc_id} ({metadata['filename']}) - {metadata['chunks_count']} chunks")
+
+     return "📚 Processed Documents:\n\n" + "\n".join(doc_list)
+
+ # Create Gradio interface
+ def create_gradio_interface():
+     with gr.Blocks(title="ChatFed Document Ingestion", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# 📚 ChatFed Document Ingestion Module")
+         gr.Markdown("Upload PDF or DOCX files to make them available for retrieval.")
+
+         with gr.Tab("📤 Upload Document"):
+             with gr.Row():
+                 with gr.Column():
+                     file_input = gr.File(
+                         label="Upload PDF or DOCX file",
+                         file_types=[".pdf", ".docx"]
+                     )
+                     process_btn = gr.Button("🔄 Process Document", variant="primary")
+
+                 with gr.Column():
+                     result_output = gr.Textbox(
+                         label="Processing Result",
+                         lines=8,
+                         interactive=False
+                     )
+                     doc_id_output = gr.Textbox(
+                         label="Document ID",
+                         interactive=False
+                     )
+
+             # Add a new section for displaying chunks
+             with gr.Row():
+                 chunks_output = gr.Textbox(
+                     label="Processed Chunks Preview",
+                     lines=15,
+                     interactive=False
+                 )
+
+             process_btn.click(
+                 fn=gradio_upload_and_process,
+                 inputs=[file_input],
+                 outputs=[result_output, doc_id_output, chunks_output]
+             )
+
+         with gr.Tab("🔍 View Document"):
+             with gr.Row():
+                 with gr.Column():
+                     doc_id_input = gr.Textbox(
+                         label="Document ID",
+                         placeholder="Enter document ID to view context..."
+                     )
+                     max_chunks_input = gr.Slider(
+                         label="Max Chunks to Display",
+                         minimum=1,
+                         maximum=20,
+                         value=5,
+                         step=1
+                     )
+                     view_btn = gr.Button("👀 View Context", variant="secondary")
+
+                 with gr.Column():
+                     context_output = gr.Textbox(
+                         label="Document Context",
+                         lines=15,
+                         interactive=False
+                     )
+
+             view_btn.click(
+                 fn=gradio_get_context,
+                 inputs=[doc_id_input, max_chunks_input],
+                 outputs=[context_output]
+             )
+
+         with gr.Tab("📋 Document List"):
+             with gr.Column():
+                 refresh_btn = gr.Button("🔄 Refresh List")
+                 doc_list_output = gr.Textbox(
+                     label="All Documents",
+                     lines=10,
+                     interactive=False
+                 )
+
+             refresh_btn.click(
+                 fn=list_documents,
+                 inputs=[],
+                 outputs=[doc_list_output]
+             )
+
+             # Load initial list
+             demo.load(fn=list_documents, inputs=[], outputs=[doc_list_output])
+
+     return demo
+
+ # FastAPI setup
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     logger.info("Document Ingestion Module starting up...")
+     yield
+     logger.info("Document Ingestion Module shutting down...")
+
+ app = FastAPI(
+     title="ChatFed Document Ingestion",
+     version="1.0.0",
+     lifespan=lifespan
+ )
+
+ @app.get("/health")
+ async def health_check():
+     return {"status": "healthy", "documents_processed": len(DOCUMENT_METADATA)}
+
+ @app.get("/")
+ async def root():
+     return {
+         "message": "ChatFed Document Ingestion API",
+         "endpoints": {
+             "health": "/health",
+             "ingest": "/ingest",
+             "context": "/context/{doc_id}",
+             "documents": "/documents"
+         }
+     }
+
+ @app.post("/ingest")
+ async def ingest_endpoint(file: UploadFile = File(...)):
+     """Ingest a document file"""
+     try:
+         file_content = await file.read()
+         result = process_document(file_content, file.filename)
+         return result
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.get("/context/{doc_id}")
+ async def get_context_endpoint(doc_id: str, max_chunks: int = 10):
+     """Get context for a specific document"""
+     try:
+         context = get_document_context(doc_id, max_chunks)
+         return {
+             "doc_id": doc_id,
+             "context": context,
+             "metadata": DOCUMENT_METADATA.get(doc_id, {})
+         }
+     except Exception as e:
+         raise HTTPException(status_code=404, detail=str(e))
+
+ @app.get("/documents")
+ async def list_documents_endpoint():
+     """List all processed documents"""
+     return {
+         "documents": list(DOCUMENT_METADATA.keys()),
+         "metadata": DOCUMENT_METADATA
+     }
+
+ # Add a simple API endpoint for the orchestrator to call
+ @app.post("/context")
+ async def get_context_simple(doc_id: str, max_chunks: int = 10):
+     """Simple context endpoint for orchestrator integration"""
+     try:
+         context = get_document_context(doc_id, max_chunks)
+         return {"context": context}
+     except Exception as e:
+         raise HTTPException(status_code=404, detail=str(e))
+
+ if __name__ == "__main__":
+     # Create and launch Gradio interface
+     demo = create_gradio_interface()
+
+     # Run both FastAPI and Gradio
+     import threading
+
+     def run_gradio():
+         demo.launch(
+             server_name="0.0.0.0",
+             server_port=7860,
+             show_error=True,
+             share=False,
+             quiet=True
+         )
+
+     def run_fastapi():
+         uvicorn.run(app, host="0.0.0.0", port=7863, log_level="info")
+
+     # Start Gradio in a background thread
+     gradio_thread = threading.Thread(target=run_gradio, daemon=True)
+     gradio_thread.start()
+
+     # Start FastAPI in a background thread
+     fastapi_thread = threading.Thread(target=run_fastapi, daemon=True)
+     fastapi_thread.start()
+
+     # Keep the main thread alive until the Gradio thread exits
+     try:
+         gradio_thread.join()
+     except KeyboardInterrupt:
+         logger.info("Shutting down...")
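
To illustrate the basic chunking this commit adds, here is a short sketch that exercises simple_text_splitter and clean_and_chunk_text directly. It assumes the packages in requirements.txt are installed and that the module is importable as app.main; the sample text and the "demo_doc" ID are illustrative only.

# Illustrative only: exercises the chunking helpers from app/main.py in-process.
from app.main import simple_text_splitter, clean_and_chunk_text

sample = ("Paragraph one about document ingestion. " * 10
          + "\n\n"
          + "Paragraph two about retrieval. " * 10)

# Raw string chunks (~500 characters each, 50-character overlap between neighbours)
chunks = simple_text_splitter(sample, chunk_size=500, chunk_overlap=50)
print(f"{len(chunks)} chunks; first chunk is {len(chunks[0])} characters")

# The same text wrapped as DocumentChunk objects with per-chunk metadata
doc_chunks = clean_and_chunk_text(sample, doc_id="demo_doc")
for c in doc_chunks[:2]:
    print(c.chunk_id, c.metadata["chunk_length"])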
app/utils.py ADDED
File without changes
params.cfg ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ fastapi==0.104.1
+ uvicorn[standard]==0.24.0
+ gradio==4.44.0
+ pydantic==2.5.2
+ python-multipart>=0.0.9
+
+ # Document processing
+ PyPDF2==3.0.1
+ python-docx==1.1.0
+
+ # Utilities
+ python-dotenv==1.0.0
+
+