Spaces:

awacke1
/

AzureCosmosDBUI

Running

App Files Community

awacke1 committed on Feb 23

Commit

4580f0a

verified ·

1 Parent(s): 15c1377

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -11

app.py CHANGED Viewed

@@ -27,7 +27,7 @@ import io
 import requests
 import numpy as np
 from urllib.parse import quote
-import PyPDF2  # For PDF text extraction
 # =============================================================================
 # ───────────── EXTERNAL HELP LINKS ─────────────
@@ -117,8 +117,10 @@ def preprocess_text(text):
     return text.strip()
 def sanitize_json_text(text):
-    text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', text)
-    return text.strip()
 def extract_pdf_text(pdf_file):
     pdf_reader = PyPDF2.PdfReader(pdf_file)
@@ -163,7 +165,7 @@ def delete_record(container, record):
     try:
         doc_id = record["id"]
         partition_key_value = record.get("pk", doc_id)
-        st.write(f"Deleting {doc_id} with partition key {partition_key_value}")
         container.delete_item(item=doc_id, partition_key=partition_key_value)
         return True, f"Record {doc_id} deleted. 🗑️"
     except exceptions.CosmosResourceNotFoundError:
@@ -392,12 +394,21 @@ def edit_all_documents(container, search_keyword=None):
             with col_save:
                 if st.button("💾 Save", key=f"save_{doc['id']}"):
                     try:
-                        cleaned_content = sanitize_json_text(edited_content)
-                        updated_doc = json.loads(cleaned_content)
-                        updated_doc['id'] = doc['id']
-                        updated_doc['pk'] = doc.get('pk', doc['id'])
-                        for field in ['_ts', '_rid', '_self', '_etag', '_attachments']:
-                            updated_doc.pop(field, None)
                         success, message = update_record(container, updated_doc)
                         if success:
                             st.success(f"Saved {doc['id']}")
@@ -446,7 +457,7 @@ def new_item_from_pdf(container, pdf_file):
         "id": new_id,
         "pk": new_id,
         "name": pdf_file.name,
-        "content": pdf_text[:1000],  # Truncate for brevity, full text in pdf_data
         "timestamp": datetime.now().isoformat(),
         "type": "pdf_document",
         "pdf_data": pdf_base64

 import requests
 import numpy as np
 from urllib.parse import quote
+import PyPDF2
 # =============================================================================
 # ───────────── EXTERNAL HELP LINKS ─────────────
     return text.strip()
 def sanitize_json_text(text):
+    # Escape special characters for JSON compatibility
+    text = text.encode().decode('unicode_escape')  # Handle existing escapes
+    text = json.dumps(text)[1:-1]  # Wrap in quotes and remove them to escape properly
+    return text
 def extract_pdf_text(pdf_file):
     pdf_reader = PyPDF2.PdfReader(pdf_file)
     try:
         doc_id = record["id"]
         partition_key_value = record.get("pk", doc_id)
+        st.write(f"Attempting to delete {doc_id} with partition key {partition_key_value}")
         container.delete_item(item=doc_id, partition_key=partition_key_value)
         return True, f"Record {doc_id} deleted. 🗑️"
     except exceptions.CosmosResourceNotFoundError:
             with col_save:
                 if st.button("💾 Save", key=f"save_{doc['id']}"):
                     try:
+                        # Handle raw text pasted into editor
+                        cleaned_content = edited_content.strip()
+                        if not (cleaned_content.startswith('{') and cleaned_content.endswith('}')):
+                            # If not JSON, assume raw text and wrap it
+                            sanitized_text = sanitize_json_text(cleaned_content)
+                            updated_doc = doc.copy()
+                            updated_doc['content'] = sanitized_text
+                        else:
+                            # If JSON, parse it directly
+                            updated_doc = json.loads(cleaned_content)
+                            updated_doc['id'] = doc['id']
+                            updated_doc['pk'] = doc.get('pk', doc['id'])
+                            for field in ['_ts', '_rid', '_self', '_etag', '_attachments']:
+                                updated_doc.pop(field, None)
                         success, message = update_record(container, updated_doc)
                         if success:
                             st.success(f"Saved {doc['id']}")
         "id": new_id,
         "pk": new_id,
         "name": pdf_file.name,
+        "content": pdf_text[:1000],
         "timestamp": datetime.now().isoformat(),
         "type": "pdf_document",
         "pdf_data": pdf_base64