alexkueck committed on
Commit
cddacd5
1 Parent(s): 48fe730

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +29 -0
utils.py CHANGED
@@ -443,6 +443,35 @@ def extract_document_info(documents):
443
  }
444
  extracted_info.append(info)
445
  return extracted_info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
 
447
 
448
 
 
443
  }
444
  extracted_info.append(info)
445
  return extracted_info
446
+
447
+
448
+
449
def extract_document_info(documents):
    """Build a list of display/download records for retrieved documents.

    Each element of *documents* is expected to expose ``page_content`` and a
    ``metadata`` mapping (both are read below). For every document a dict
    with these keys is produced:

    - ``'content'``: the document's ``page_content``
    - ``'metadata'``: the document's ``metadata`` mapping, unchanged
    - ``'titel'``: file name taken from ``metadata["path"]``, or
      ``"Keine Überschrift"`` when no file name can be derived
    - ``'seite'``: ``metadata["page"]`` or ``"Unbekannte Seite"``
    - ``'pfad'``: the path from the metadata (``""`` when missing/``None``)
    - ``'download_link'``: a Hugging Face ``resolve`` URL for ``.pdf`` and
      ``.docx`` files, otherwise the path itself

    Returns:
        A list with one dict per input document, in input order.
    """
    # Local import so the block does not depend on a file-level import.
    from urllib.parse import quote

    base_url = (
        "https://huggingface.co/spaces/alexkueck/SucheRAG/resolve/main/chroma/kkg"
    )

    extracted_info = []
    for doc in documents:
        # A "path" key that is present but None must not crash basename().
        doc_path = doc.metadata.get("path") or ""

        # The file name doubles as the human-readable title.
        filename = os.path.basename(doc_path)
        title = filename if filename else "Keine Überschrift"

        # Percent-encode the file name so spaces / umlauts yield a valid URL.
        url_name = quote(title, safe="")

        # Match extensions case-insensitively (".PDF" is still a PDF).
        # NOTE(review): "hf_token" below is a literal string, not an
        # interpolated variable -- confirm whether a real token was intended
        # (and whether embedding it in a URL is acceptable).
        lowered = doc_path.lower()
        if lowered.endswith(".pdf"):
            download_link = f"{base_url}/pdf/{url_name}?token=hf_token"
        elif lowered.endswith(".docx"):
            download_link = f"{base_url}/word/{url_name}?token=hf_token"
        else:
            # Unknown type: fall back to the raw path.
            download_link = doc_path

        extracted_info.append(
            {
                "content": doc.page_content,
                "metadata": doc.metadata,
                "titel": title,
                "seite": doc.metadata.get("page", "Unbekannte Seite"),
                "pfad": doc_path,
                "download_link": download_link,
            }
        )
    return extracted_info
475
 
476
 
477