Spaces:
Sleeping
Sleeping
Lucas ARRIESSE
committed on
Commit
·
4edd44f
1
Parent(s):
7e24a88
Add debug statements to narrow issue
Browse files
- api/docs.py +11 -0
api/docs.py
CHANGED
@@ -125,21 +125,32 @@ async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> l
|
|
125 |
if ext == ".doc":
|
126 |
logging.debug(f"Converting {filename} .doc --> .docx")
|
127 |
docx_bytes = await convert_file_type(bytes, filename, "doc", "docx")
|
|
|
|
|
128 |
extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
|
129 |
final_text = extracted_data.content
|
|
|
130 |
elif ext == ".docx":
|
131 |
# Applying doc revisions to docx files (especially for pCR / draftCR files)
|
132 |
logging.debug(f"Updating .docx revisions for {filename}.")
|
133 |
applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
|
|
|
|
|
134 |
extracted_data = await extract_bytes(applied_revision.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
|
135 |
final_text = extracted_data.content
|
|
|
136 |
elif ext == ".ppt":
|
137 |
logging.debug(f"Converting {filename} .ppt --> .pptx")
|
138 |
docx_bytes = await convert_file_type(bytes, filename, "ppt", "pptx")
|
|
|
|
|
139 |
extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
|
140 |
final_text = extracted_data.content
|
|
|
141 |
else:
|
142 |
if ext in FORMAT_MIME_TYPES: # check if file extension is supported
|
|
|
|
|
143 |
extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
|
144 |
final_text = extracted_data.content
|
145 |
else:
|
|
|
125 |
if ext == ".doc":
|
126 |
logging.debug(f"Converting {filename} .doc --> .docx")
|
127 |
docx_bytes = await convert_file_type(bytes, filename, "doc", "docx")
|
128 |
+
logging.debug(
|
129 |
+
f"Extracting content for filename: {filename}, ext: {ext} with converted doc")
|
130 |
extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
|
131 |
final_text = extracted_data.content
|
132 |
+
logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
|
133 |
elif ext == ".docx":
|
134 |
# Applying doc revisions to docx files (especially for pCR / draftCR files)
|
135 |
logging.debug(f"Updating .docx revisions for {filename}.")
|
136 |
applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
|
137 |
+
logging.debug(
|
138 |
+
f"Extracting content for filename: {filename}, ext: {ext} with converted docx")
|
139 |
extracted_data = await extract_bytes(applied_revision.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
|
140 |
final_text = extracted_data.content
|
141 |
+
logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
|
142 |
elif ext == ".ppt":
|
143 |
logging.debug(f"Converting {filename} .ppt --> .pptx")
|
144 |
docx_bytes = await convert_file_type(bytes, filename, "ppt", "pptx")
|
145 |
+
logging.debug(
|
146 |
+
f"Extracting content for filename: {filename}, ext: {ext} with converted ppt")
|
147 |
extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
|
148 |
final_text = extracted_data.content
|
149 |
+
logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
|
150 |
else:
|
151 |
if ext in FORMAT_MIME_TYPES: # check if file extension is supported
|
152 |
+
logging.debug(
|
153 |
+
f"Extracting content for filename: {filename}, ext: {ext}")
|
154 |
extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
|
155 |
final_text = extracted_data.content
|
156 |
else:
|