mtyrrell committed on
Commit
efd3c5f
·
1 Parent(s): b6ee133

ingestor context handling

Browse files
Files changed (1) hide show
  1. utils/sources.py +25 -8
utils/sources.py CHANGED
@@ -122,15 +122,32 @@ def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, Li
122
  if isinstance(result, str):
123
  result = ast.literal_eval(result)
124
 
 
 
125
  metadata = result.get('answer_metadata', {})
126
- doc_info = {
127
- 'answer': result.get('answer', ''),
128
- 'filename': metadata.get('filename', 'Unknown'),
129
- 'page': metadata.get('page', 'Unknown'),
130
- 'year': metadata.get('year', 'Unknown'),
131
- 'source': metadata.get('source', 'Unknown'),
132
- 'document_id': metadata.get('_id', 'Unknown')
133
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  processed_results.append(doc_info)
135
 
136
  # Format context string - SIMPLIFIED TO ONLY USE [1], [2], [3]
 
122
  if isinstance(result, str):
123
  result = ast.literal_eval(result)
124
 
125
+ # Handle both ingested files (metadata at top level) and retrieved documents (metadata in answer_metadata)
126
+ # Check if metadata is nested in 'answer_metadata' (retrieved documents)
127
  metadata = result.get('answer_metadata', {})
128
+
129
+ # If answer_metadata is missing, empty, or holds only None/'Unknown' values, fall back to the top level (ingested files)
130
+ if not metadata or all(v is None or v == 'Unknown' for v in metadata.values()):
131
+ # For ingested files, metadata is at the top level
132
+ doc_info = {
133
+ 'answer': result.get('answer', result.get('content', '')),
134
+ 'filename': result.get('filename', 'Unknown'),
135
+ 'page': result.get('page', 'Unknown'),
136
+ 'year': result.get('year', 'Unknown'),
137
+ 'source': result.get('source', 'Unknown'),
138
+ 'document_id': result.get('_id', result.get('document_id', 'Unknown'))
139
+ }
140
+ else:
141
+ # For retrieved documents, use nested metadata
142
+ doc_info = {
143
+ 'answer': result.get('answer', ''),
144
+ 'filename': metadata.get('filename', 'Unknown'),
145
+ 'page': metadata.get('page', 'Unknown'),
146
+ 'year': metadata.get('year', 'Unknown'),
147
+ 'source': metadata.get('source', 'Unknown'),
148
+ 'document_id': metadata.get('_id', 'Unknown')
149
+ }
150
+
151
  processed_results.append(doc_info)
152
 
153
  # Format context string - SIMPLIFIED TO ONLY USE [1], [2], [3]