Zeggai commited on
Commit
0d9a678
·
verified ·
1 Parent(s): 1d6e6be

Delete document_uploader.py

Browse files
Files changed (1) hide show
  1. document_uploader.py +0 -200
document_uploader.py DELETED
@@ -1,200 +0,0 @@
1
- # --- Imports ---
2
- import os
3
- import re
4
- from pathlib import Path
5
- from global_settings import STORAGE_PATH, CACHE_FILE
6
- from logging_functions import log_action
7
-
8
- # LlamaIndex Core Imports
9
- from llama_index.core import SimpleDirectoryReader, Settings
10
- from llama_index.core.ingestion import IngestionPipeline, IngestionCache
11
- # Import the Unstructured Node Parser
12
- from llama_index.core.node_parser import UnstructuredElementNodeParser # For parsing PDFs directly
13
- from llama_index.core.extractors import SummaryExtractor # Optional
14
-
15
- # Import Embedding Model
16
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
17
-
18
- # Import LLM (Gemini) - Optional, only if SummaryExtractor runs
19
- from llama_index.llms.google_genai import GoogleGenAI
20
-
21
# --- Function Definition ---
def ingest_section_docs_unstructured(
    input_path=STORAGE_PATH,
    cache_path=CACHE_FILE,
    process_filename=None,
    use_summaries=False
):
    """
    Ingest one or more SECTION PDF documents using SimpleDirectoryReader
    (basic text extraction) followed by UnstructuredElementNodeParser inside
    an IngestionPipeline, then embed the resulting nodes.

    Section metadata (``section_id`` / ``section_title``) is derived from
    filenames of the form ``"<number>. <title>.pdf"``.

    Args:
        input_path (str): Directory containing the section PDF documents.
        cache_path (str): Path to the ingestion cache file.
        process_filename (str, optional): If provided, only process the file
            with this name. Defaults to None (process all PDFs in input_path).
        use_summaries (bool): Whether to include SummaryExtractor (requires
            GOOGLE_API_KEY). Defaults to False.

    Returns:
        list: Processed BaseNode objects with section metadata, or an empty
        list on failure.
    """
    # --- LLM & Embedding Configuration ---
    print("Configuring LLM (Gemini if needed) and Embedding models...")
    gemini_api_key = os.getenv("GOOGLE_API_KEY")
    if use_summaries:
        if not gemini_api_key:
            # Summaries need an LLM; degrade gracefully instead of failing.
            print("Warning: GOOGLE_API_KEY not set, but summaries requested. Disabling summaries.")
            Settings.llm = None
            use_summaries = False
        else:
            # NOTE(review): recent llama-index GoogleGenAI constructors take
            # `model=` rather than `model_name=` — kept as-is here; confirm
            # against the installed llama-index-llms-google-genai version.
            Settings.llm = GoogleGenAI(model_name="models/gemini-1.5-flash-latest", api_key=gemini_api_key)
            print(f"Gemini LLM configured: {Settings.llm.model}")
    else:
        Settings.llm = None
        print("LLM not configured as summaries are disabled.")

    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    print(f"Embedding Model: {Settings.embed_model.model_name}")
    # --- End Configuration ---

    # --- Load Data (Basic Text Extraction) ---
    # SimpleDirectoryReader performs basic PDF text extraction here; the
    # Unstructured parser in the pipeline re-parses elements afterwards.
    print(f"Attempting to load documents from: {input_path}")
    reader_kwargs = {"filename_as_id": True, "required_exts": [".pdf"]}

    if process_filename:
        print(f"Attempting to load specific section file: {process_filename}")
        file_path = Path(input_path) / process_filename
        if not file_path.exists() or file_path.suffix.lower() != ".pdf":
            print(f"Error: Specified file '{process_filename}' not found or not a PDF in '{input_path}'.")
            return []
        # Load the single requested PDF.
        reader = SimpleDirectoryReader(input_files=[file_path], **reader_kwargs)
    else:
        print(f"Loading all PDF files from directory: {input_path}")
        reader = SimpleDirectoryReader(input_path, **reader_kwargs)

    try:
        # Basic Document objects with raw text extracted by the reader.
        documents_to_process = reader.load_data(show_progress=True)
        print(f"Successfully loaded {len(documents_to_process)} documents (basic extraction).")
    except Exception as e:
        print(f"Error loading documents: {e}")
        import traceback
        traceback.print_exc()
        return []

    if not documents_to_process:
        print("No documents loaded. Exiting ingestion.")
        return []

    # --- Add Section Metadata Based on Filename ---
    # Applied to the loaded docs BEFORE the pipeline re-parses them, so the
    # metadata propagates to the final nodes.
    print("Adding section metadata based on filenames...")
    docs_with_metadata = []
    filename_pattern = re.compile(r"^(\d+)\.\s+(.*)\.pdf$", re.IGNORECASE)
    for doc in documents_to_process:
        filename = doc.metadata.get('file_name', doc.id_)  # fall back to id_ if filename missing
        section_id = "unknown"
        section_title = "unknown"
        match = filename_pattern.match(filename)
        if match:
            section_id = match.group(1).strip()
            section_title = match.group(2).strip()
        else:
            # BUGFIX: the original printed the literal text '(unknown)' here
            # instead of interpolating the actual filename.
            print(f"Warning: Filename '{filename}' did not match expected pattern 'Number. Title.pdf'")

        doc.metadata['section_id'] = section_id
        doc.metadata['section_title'] = section_title
        docs_with_metadata.append(doc)
        # BUGFIX: interpolate the actual filename into the audit-log entry
        # (original logged the literal '(unknown)').
        log_action(f"File '{filename}' (Section {section_id}) loaded.", action_type="LOAD")
    # --- End Metadata Addition ---

    # --- Caching Logic ---
    try:
        cache = IngestionCache.from_persist_path(cache_path)
        print("Cache file found. Running using cache...")
    except FileNotFoundError:
        # No persisted cache yet; start with a fresh in-memory cache.
        cache = IngestionCache()
        print("No cache file found or error reading cache. Running without...")

    # --- Define the Ingestion Pipeline (Unstructured Parser FIRST) ---
    print("Defining ingestion pipeline (Unstructured Parser, Embedding)...")

    # 1. UnstructuredElementNodeParser re-parses the raw Documents using the
    #    'unstructured' library for better layout/element detection.
    node_parser = UnstructuredElementNodeParser()

    # 2. (Optional) Summary Extractor — only when requested AND an LLM is set.
    summary_extractor = SummaryExtractor(summaries=['self']) if use_summaries and Settings.llm else None

    transformations = [node_parser]  # Unstructured parser must run first
    if summary_extractor:
        transformations.append(summary_extractor)
    # 3. Embedding model (from Settings) runs last.
    transformations.append(Settings.embed_model)

    pipeline = IngestionPipeline(
        transformations=transformations,
        cache=cache
    )
    print(f"Pipeline transformations: {[type(t).__name__ for t in pipeline.transformations]}")

    # --- Run Pipeline ---
    print("Running ingestion pipeline (Unstructured Parsing, Embedding)...")
    # Pass the initial Documents (with added metadata) to the pipeline;
    # UnstructuredElementNodeParser processes them first.
    final_nodes = pipeline.run(documents=docs_with_metadata, show_progress=True)
    print(f"Ingestion pipeline complete. Processed/Generated {len(final_nodes)} final nodes.")

    # --- Node Inspection (first few nodes, for sanity checking) ---
    if final_nodes:
        print("\n--- Inspecting Final Nodes (Post-Pipeline) ---")
        for i in range(min(len(final_nodes), 3)):
            node_to_inspect = final_nodes[i]
            print(f"\n--- Node {i} (ID: {node_to_inspect.node_id}) ---")
            print("Metadata:")
            print(node_to_inspect.metadata)  # verify section_id etc.
            print("\nContent (first 500 chars):")
            print(node_to_inspect.text[:500] + "...")
            print("-" * 20)

    # --- Persist Cache ---
    print(f"Persisting cache to {cache_path}...")
    pipeline.cache.persist(cache_path)
    print("Cache persisted.")

    return final_nodes
178
-
179
# --- Script Execution ---
if __name__ == "__main__":
    print("Starting Section Document Ingestion using Unstructured...")
    # Prerequisites:
    #   1. Section PDFs placed in STORAGE_PATH.
    #   2. 'unstructured' dependencies installed.
    #   3. GOOGLE_API_KEY set if summaries are enabled.

    generate_summaries = False  # keep False to avoid LLM calls initially
    process_this_file = None    # e.g. "2. REPERES SUR LES MALADIES....pdf", or None for all

    # Announce what will be processed.
    scope_message = (
        f"Processing single file: {process_this_file}"
        if process_this_file
        else f"Processing all PDF files found in: {STORAGE_PATH}"
    )
    print(scope_message)

    nodes_output = ingest_section_docs_unstructured(
        process_filename=process_this_file,
        use_summaries=generate_summaries
    )

    node_count = len(nodes_output) if nodes_output else 0
    print(f"\nIngestion process finished. {node_count} nodes processed.")
    # ... rest of main block ...