Lucas ARRIESSE committed on
Commit
4edd44f
·
1 Parent(s): 7e24a88

Add debug statements to narrow issue

Browse files
Files changed (1) hide show
  1. api/docs.py +11 -0
api/docs.py CHANGED
@@ -125,21 +125,32 @@ async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> l
125
  if ext == ".doc":
126
  logging.debug(f"Converting {filename} .doc --> .docx")
127
  docx_bytes = await convert_file_type(bytes, filename, "doc", "docx")
 
 
128
  extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
129
  final_text = extracted_data.content
 
130
  elif ext == ".docx":
131
  # Applying doc revisions to docx files (especially for pCR / draftCR files)
132
  logging.debug(f"Updating .docx revisions for {filename}.")
133
  applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
 
 
134
  extracted_data = await extract_bytes(applied_revision.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
135
  final_text = extracted_data.content
 
136
  elif ext == ".ppt":
137
  logging.debug(f"Converting {filename} .ppt --> .pptx")
138
  docx_bytes = await convert_file_type(bytes, filename, "ppt", "pptx")
 
 
139
  extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
140
  final_text = extracted_data.content
 
141
  else:
142
  if ext in FORMAT_MIME_TYPES: # check if file extension is supported
 
 
143
  extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
144
  final_text = extracted_data.content
145
  else:
 
125
  if ext == ".doc":
126
  logging.debug(f"Converting {filename} .doc --> .docx")
127
  docx_bytes = await convert_file_type(bytes, filename, "doc", "docx")
128
+ logging.debug(
129
+ f"Extracting content for filename: {filename}, ext: {ext} with converted doc")
130
  extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
131
  final_text = extracted_data.content
132
+ logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
133
  elif ext == ".docx":
134
  # Applying doc revisions to docx files (especially for pCR / draftCR files)
135
  logging.debug(f"Updating .docx revisions for {filename}.")
136
  applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
137
+ logging.debug(
138
+ f"Extracting content for filename: {filename}, ext: {ext} with converted docx")
139
  extracted_data = await extract_bytes(applied_revision.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
140
  final_text = extracted_data.content
141
+ logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
142
  elif ext == ".ppt":
143
  logging.debug(f"Converting {filename} .ppt --> .pptx")
144
  docx_bytes = await convert_file_type(bytes, filename, "ppt", "pptx")
145
+ logging.debug(
146
+ f"Extracting content for filename: {filename}, ext: {ext} with converted ppt")
147
  extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
148
  final_text = extracted_data.content
149
+ logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
150
  else:
151
  if ext in FORMAT_MIME_TYPES: # check if file extension is supported
152
+ logging.debug(
153
+ f"Extracting content for filename: {filename}, ext: {ext}")
154
  extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
155
  final_text = extracted_data.content
156
  else: