Spaces:

Sarath0x8f
/

Document-QA-bot

Running

App Files Community

Sarath0x8f committed on Nov 9, 2024

Commit

90a2d71

verified ·

1 Parent(s): 3e21c23

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -2

app.py CHANGED Viewed

@@ -39,10 +39,37 @@ selected_model_name = models[0]  # Default to the first model in the list
 # Initialize the parser
 parser = LlamaParse(api_key=os.getenv("LLAMA_INDEX_API"), result_type='markdown')
-file_extractor = {'.pdf': parser, '.docx': parser, '.doc': parser}
 # Embedding model and index initialization (to be populated by uploaded files)
-embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
 # Global variable to store documents loaded from user-uploaded files
 vector_index = None

 # Initialize the parser
 parser = LlamaParse(api_key=os.getenv("LLAMA_INDEX_API"), result_type='markdown')
+# Define file extractor with various common extensions
+file_extractor = {
+    '.pdf': parser,  # PDF documents
+    '.docx': parser,  # Microsoft Word documents
+    '.doc': parser,  # Older Microsoft Word documents
+    '.txt': parser,  # Plain text files
+    '.csv': parser,  # Comma-separated values files
+    '.xlsx': parser,  # Microsoft Excel files (requires additional processing for tables)
+    '.pptx': parser,  # Microsoft PowerPoint files (for slides)
+    '.html': parser,  # HTML files (web pages)
+    # '.rtf': parser,  # Rich Text Format files
+    # '.odt': parser,  # OpenDocument Text files
+    # '.epub': parser,  # ePub files (e-books)
+    # Image files for OCR processing
+    '.jpg': parser,  # JPEG images
+    '.jpeg': parser,  # JPEG images
+    '.png': parser,  # PNG images
+    # '.bmp': parser,  # Bitmap images
+    # '.tiff': parser,  # TIFF images
+    # '.tif': parser,  # TIFF images (alternative extension)
+    # '.gif': parser,  # GIF images (can contain text)
+    # Scanned documents in image formats
+    '.webp': parser,  # WebP images
+    '.svg': parser,  # SVG files (vector format, may contain embedded text)
+}
 # Embedding model and index initialization (to be populated by uploaded files)
+# embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
+embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
 # Global variable to store documents loaded from user-uploaded files
 vector_index = None