Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -39,10 +39,37 @@ selected_model_name = models[0] # Default to the first model in the list
|
|
| 39 |
|
| 40 |
# Initialize the parser
|
| 41 |
parser = LlamaParse(api_key=os.getenv("LLAMA_INDEX_API"), result_type='markdown')
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
# Embedding model and index initialization (to be populated by uploaded files)
|
| 45 |
-
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
|
|
|
|
| 46 |
|
| 47 |
# Global variable holding the vector index built from user-uploaded files (None until files are uploaded)
|
| 48 |
vector_index = None
|
|
|
|
| 39 |
|
| 40 |
# Initialize the parser
|
| 41 |
parser = LlamaParse(api_key=os.getenv("LLAMA_INDEX_API"), result_type='markdown')
|
| 42 |
+
# Define file extractor with various common extensions
|
| 43 |
+
file_extractor = {
|
| 44 |
+
'.pdf': parser, # PDF documents
|
| 45 |
+
'.docx': parser, # Microsoft Word documents
|
| 46 |
+
'.doc': parser, # Older Microsoft Word documents
|
| 47 |
+
'.txt': parser, # Plain text files
|
| 48 |
+
'.csv': parser, # Comma-separated values files
|
| 49 |
+
'.xlsx': parser, # Microsoft Excel files (requires additional processing for tables)
|
| 50 |
+
'.pptx': parser, # Microsoft PowerPoint files (for slides)
|
| 51 |
+
'.html': parser, # HTML files (web pages)
|
| 52 |
+
# '.rtf': parser, # Rich Text Format files
|
| 53 |
+
# '.odt': parser, # OpenDocument Text files
|
| 54 |
+
# '.epub': parser, # ePub files (e-books)
|
| 55 |
+
|
| 56 |
+
# Image files for OCR processing
|
| 57 |
+
'.jpg': parser, # JPEG images
|
| 58 |
+
'.jpeg': parser, # JPEG images
|
| 59 |
+
'.png': parser, # PNG images
|
| 60 |
+
# '.bmp': parser, # Bitmap images
|
| 61 |
+
# '.tiff': parser, # TIFF images
|
| 62 |
+
# '.tif': parser, # TIFF images (alternative extension)
|
| 63 |
+
# '.gif': parser, # GIF images (can contain text)
|
| 64 |
+
|
| 65 |
+
# Additional image formats (WebP raster images and SVG vector graphics)
|
| 66 |
+
'.webp': parser, # WebP images
|
| 67 |
+
'.svg': parser, # SVG files (vector format, may contain embedded text)
|
| 68 |
+
}
|
| 69 |
|
| 70 |
# Embedding model and index initialization (to be populated by uploaded files)
|
| 71 |
+
# embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
|
| 72 |
+
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 73 |
|
| 74 |
# Global variable holding the vector index built from user-uploaded files (None until files are uploaded)
|
| 75 |
vector_index = None
|