awacke1 committed on
Commit
4580f0a
·
verified ·
1 Parent(s): 15c1377

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -11
app.py CHANGED
@@ -27,7 +27,7 @@ import io
27
  import requests
28
  import numpy as np
29
  from urllib.parse import quote
30
- import PyPDF2 # For PDF text extraction
31
 
32
  # =============================================================================
33
  # ───────────── EXTERNAL HELP LINKS ─────────────
@@ -117,8 +117,10 @@ def preprocess_text(text):
117
  return text.strip()
118
 
119
  def sanitize_json_text(text):
120
- text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', text)
121
- return text.strip()
 
 
122
 
123
  def extract_pdf_text(pdf_file):
124
  pdf_reader = PyPDF2.PdfReader(pdf_file)
@@ -163,7 +165,7 @@ def delete_record(container, record):
163
  try:
164
  doc_id = record["id"]
165
  partition_key_value = record.get("pk", doc_id)
166
- st.write(f"Deleting {doc_id} with partition key {partition_key_value}")
167
  container.delete_item(item=doc_id, partition_key=partition_key_value)
168
  return True, f"Record {doc_id} deleted. 🗑️"
169
  except exceptions.CosmosResourceNotFoundError:
@@ -392,12 +394,21 @@ def edit_all_documents(container, search_keyword=None):
392
  with col_save:
393
  if st.button("💾 Save", key=f"save_{doc['id']}"):
394
  try:
395
- cleaned_content = sanitize_json_text(edited_content)
396
- updated_doc = json.loads(cleaned_content)
397
- updated_doc['id'] = doc['id']
398
- updated_doc['pk'] = doc.get('pk', doc['id'])
399
- for field in ['_ts', '_rid', '_self', '_etag', '_attachments']:
400
- updated_doc.pop(field, None)
 
 
 
 
 
 
 
 
 
401
  success, message = update_record(container, updated_doc)
402
  if success:
403
  st.success(f"Saved {doc['id']}")
@@ -446,7 +457,7 @@ def new_item_from_pdf(container, pdf_file):
446
  "id": new_id,
447
  "pk": new_id,
448
  "name": pdf_file.name,
449
- "content": pdf_text[:1000], # Truncate for brevity, full text in pdf_data
450
  "timestamp": datetime.now().isoformat(),
451
  "type": "pdf_document",
452
  "pdf_data": pdf_base64
 
27
  import requests
28
  import numpy as np
29
  from urllib.parse import quote
30
+ import PyPDF2
31
 
32
  # =============================================================================
33
  # ───────────── EXTERNAL HELP LINKS ─────────────
 
117
  return text.strip()
118
 
119
  def sanitize_json_text(text):
120
+ # Escape special characters for JSON compatibility
121
+ text = text.encode().decode('unicode_escape') # Handle existing escapes
122
+ text = json.dumps(text)[1:-1] # Wrap in quotes and remove them to escape properly
123
+ return text
124
 
125
  def extract_pdf_text(pdf_file):
126
  pdf_reader = PyPDF2.PdfReader(pdf_file)
 
165
  try:
166
  doc_id = record["id"]
167
  partition_key_value = record.get("pk", doc_id)
168
+ st.write(f"Attempting to delete {doc_id} with partition key {partition_key_value}")
169
  container.delete_item(item=doc_id, partition_key=partition_key_value)
170
  return True, f"Record {doc_id} deleted. 🗑️"
171
  except exceptions.CosmosResourceNotFoundError:
 
394
  with col_save:
395
  if st.button("💾 Save", key=f"save_{doc['id']}"):
396
  try:
397
+ # Handle raw text pasted into editor
398
+ cleaned_content = edited_content.strip()
399
+ if not (cleaned_content.startswith('{') and cleaned_content.endswith('}')):
400
+ # If not JSON, assume raw text and wrap it
401
+ sanitized_text = sanitize_json_text(cleaned_content)
402
+ updated_doc = doc.copy()
403
+ updated_doc['content'] = sanitized_text
404
+ else:
405
+ # If JSON, parse it directly
406
+ updated_doc = json.loads(cleaned_content)
407
+ updated_doc['id'] = doc['id']
408
+ updated_doc['pk'] = doc.get('pk', doc['id'])
409
+ for field in ['_ts', '_rid', '_self', '_etag', '_attachments']:
410
+ updated_doc.pop(field, None)
411
+
412
  success, message = update_record(container, updated_doc)
413
  if success:
414
  st.success(f"Saved {doc['id']}")
 
457
  "id": new_id,
458
  "pk": new_id,
459
  "name": pdf_file.name,
460
+ "content": pdf_text[:1000],
461
  "timestamp": datetime.now().isoformat(),
462
  "type": "pdf_document",
463
  "pdf_data": pdf_base64