KingNish committed on
Commit
5a95216
·
verified ·
1 Parent(s): 55203fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -4
app.py CHANGED
@@ -92,7 +92,7 @@ def extract_text_from_pptx(pptx_data, clean=True):
92
  text = clean_text(text)
93
  return text, len(text)
94
 
95
- def read_document(file_path, clean=True):
96
  with open(file_path, "rb") as f:
97
  file_content = f.read()
98
 
@@ -159,8 +159,8 @@ def read_document(file_path, clean=True):
159
  soup = BeautifulSoup(file_content, 'html.parser')
160
  structured_data = {
161
  "Texts": extract_texts(soup),
162
- "Links": extract_links(soup, ""),
163
- "Images": extract_images(soup, "")
164
  }
165
  return format_detailed_output(structured_data), 0
166
  except Exception as e:
@@ -204,7 +204,7 @@ def download_and_process_file(url, clean=True):
204
  if kind and kind.mime.startswith('image/'):
205
  return f"![]({url})", 0 # Return markdown image syntax if it's an image
206
  else:
207
- return read_document(temp_filename, clean) # Otherwise, process as a document
208
 
209
  except requests.exceptions.MissingSchema:
210
  return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
 
92
  text = clean_text(text)
93
  return text, len(text)
94
 
95
+ def read_document(file_path, clean=True, url=""):
96
  with open(file_path, "rb") as f:
97
  file_content = f.read()
98
 
 
159
  soup = BeautifulSoup(file_content, 'html.parser')
160
  structured_data = {
161
  "Texts": extract_texts(soup),
162
+ "Links": extract_links(soup, url),
163
+ "Images": extract_images(soup, url)
164
  }
165
  return format_detailed_output(structured_data), 0
166
  except Exception as e:
 
204
  if kind and kind.mime.startswith('image/'):
205
  return f"![]({url})", 0 # Return markdown image syntax if it's an image
206
  else:
207
+ return read_document(temp_filename, clean, url) # Otherwise, process as a document
208
 
209
  except requests.exceptions.MissingSchema:
210
  return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0