sam2ai committed on
Commit
ccf039b
1 Parent(s): b18c1e9

Synced repo using 'sync_with_huggingface' GitHub Action

Browse files
pages/1_URLs.py CHANGED
@@ -25,13 +25,17 @@ def check_sitemap(url):
25
  # Check for sitemap-specific elements
26
  if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
27
  return True
28
- except etree.XMLSyntaxError:
29
- pass
30
-
31
  # Additional conditions for identifying sitemaps
32
- if 'sitemap' in url.lower():
33
- # Perform additional checks specific to the website's structure or naming conventions
34
- return True
 
 
 
 
 
35
 
36
  return False
37
 
@@ -152,29 +156,35 @@ def process_urls(sitemap_urls , category):
152
  # function to process for a single URL
153
  def run_function(url , category):
154
  extracted_txt = ""
155
- # Check if the user has provided a URL
156
- if url:
157
- if valid_url(url):
158
- temp_para = extract_data_from_url_(url)
159
- temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
160
- extracted_txt = temp_txt_data
161
- extracted_jsonl = {"text": str(temp_para), "url":str(url) , "category": category , "timestamp": str(datetime.datetime.now())}
162
 
163
- # displaying extracted txt for single URL
164
- st.text_area("Extracted Text", value=extracted_txt, height=200)
165
-
166
-
167
- extracted_jsonl = json.dumps(extracted_jsonl, ensure_ascii=False)
 
 
 
 
 
 
 
 
 
 
168
 
169
- # return extract status, and the data extracted
170
- return True, extracted_txt, extracted_jsonl
 
 
171
  else:
 
 
172
  return False, None, None
173
- else:
174
- st.error("Error: An error occurred while fetching content.")
175
- # return extract status, and the data extracted
176
- return False, None, None
177
 
 
178
 
179
 
180
  def main():
@@ -314,23 +324,26 @@ def main():
314
  save_as_json = st.checkbox("jsonl", value=False)
315
 
316
  if not save_as_txt and not save_as_json:
317
- if st.button("Clear"):
318
- st.session_state.button_enter_url = False
319
- st.session_state.Initial = True
320
- st.session_state.extracted_url = False
321
- if 'sitemap_data_text' in st.session_state:
322
- del st.session_state['sitemap_data_text']
323
- if 'sitemap_data_jsonl' in st.session_state:
324
- del st.session_state['sitemap_data_jsonl']
325
- st.session_state.button_enter_url = False
326
- st.experimental_rerun()
327
- else:
 
 
 
 
328
  col1, col2 = st.columns([0.5, 0.5])
329
  # save column
330
  with col1:
331
 
332
  if is_a_sitemap:
333
-
334
  if save_as_txt:
335
  if st.download_button(label="Save as txt",data=st.session_state.sitemap_data_text ):
336
  saved_successfully = True
@@ -357,7 +370,34 @@ def main():
357
  del st.session_state['sitemap_data_jsonl']
358
  st.session_state.button_enter_url = False
359
  st.experimental_rerun()
360
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  if saved_successfully:
362
  # Confirmation message
363
  st.success(f"File saved successfully.")
@@ -365,10 +405,14 @@ def main():
365
  st.write("#")
366
  else:
367
  st.warning("Data not extracted")
368
- if st.button("clear"):
369
- st.session_state.button_enter_url = False
370
- st.session_state.extracted_url = False
371
- st.experimental_rerun()
 
 
 
 
372
  st.write("#")
373
  st.write("#")
374
 
 
25
  # Check for sitemap-specific elements
26
  if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
27
  return True
28
+ except Exception as e:
29
+ st.error("Invalid sitemap!!")
 
30
  # Additional conditions for identifying sitemaps
31
+ elif 'sitemap' in url.lower():
32
+ try:
33
+ response = requests.get(url)
34
+ # Perform additional checks specific to the website's structure or naming conventions
35
+ return True
36
+ except Exception as e:
37
+ # st.error("Invalid sitemap!!")
38
+ pass
39
 
40
  return False
41
 
 
156
  # function to process for a single URL
157
  def run_function(url , category):
158
  extracted_txt = ""
 
 
 
 
 
 
 
159
 
160
+ try:
161
+ response = requests.get(url)
162
+ # Check if the user has provided a URL
163
+ if url:
164
+ if valid_url(url):
165
+ temp_para = extract_data_from_url_(url)
166
+ temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
167
+ extracted_txt = temp_txt_data
168
+ extracted_jsonl = {"text": str(temp_para), "url":str(url) , "category": category , "timestamp": str(datetime.datetime.now())}
169
+
170
+ # displaying extracted txt for single URL
171
+ st.text_area("Extracted Text", value=extracted_txt, height=200)
172
+
173
+
174
+ extracted_jsonl = json.dumps(extracted_jsonl, ensure_ascii=False)
175
 
176
+ # return extract status, and the data extracted
177
+ return True, extracted_txt, extracted_jsonl
178
+ else:
179
+ return False, None, None
180
  else:
181
+ st.error("Error: An error occurred while fetching content.")
182
+ # return extract status, and the data extracted
183
  return False, None, None
184
+ except Exception as e:
185
+ st.error("Invalid URL")
 
 
186
 
187
+ return False, None, None
188
 
189
 
190
  def main():
 
324
  save_as_json = st.checkbox("jsonl", value=False)
325
 
326
  if not save_as_txt and not save_as_json:
327
+ clear_c1, clear_c2 = st.columns([0.5,0.5])
328
+ with clear_c1:
329
+ if st.button("Clear"):
330
+ st.session_state.button_enter_url = False
331
+ st.session_state.Initial = True
332
+ st.session_state.extracted_url = False
333
+ if 'sitemap_data_text' in st.session_state:
334
+ del st.session_state['sitemap_data_text']
335
+ if 'sitemap_data_jsonl' in st.session_state:
336
+ del st.session_state['sitemap_data_jsonl']
337
+ st.session_state.button_enter_url = False
338
+ st.experimental_rerun()
339
+ with clear_c2:
340
+ print()
341
+ elif (save_as_txt and not save_as_json) or (save_as_json and not save_as_txt):
342
  col1, col2 = st.columns([0.5, 0.5])
343
  # save column
344
  with col1:
345
 
346
  if is_a_sitemap:
 
347
  if save_as_txt:
348
  if st.download_button(label="Save as txt",data=st.session_state.sitemap_data_text ):
349
  saved_successfully = True
 
370
  del st.session_state['sitemap_data_jsonl']
371
  st.session_state.button_enter_url = False
372
  st.experimental_rerun()
373
+ elif save_as_txt and save_as_json:
374
+ savetxt_c1,saveJson_c2,clear_c3 = st.columns([0.25,0.25,0.5])
375
+ with savetxt_c1:
376
+ if is_a_sitemap:
377
+ if st.download_button(label="Save as txt",data=st.session_state.sitemap_data_text ):
378
+ saved_successfully = True
379
+ else:
380
+ if st.download_button(label="Save as txt",data=data_txt ):
381
+ saved_successfully = True
382
+ with saveJson_c2:
383
+ if is_a_sitemap:
384
+ if st.download_button(label="Save as jsonl", data=st.session_state.sitemap_data_jsonl, mime="application/json"):
385
+ saved_successfully = True
386
+ else:
387
+ if save_as_json:
388
+ if st.download_button(label="Save as jsonl", data=data_jsonl, mime="application/json"):
389
+ saved_successfully = True
390
+ with clear_c3:
391
+ if st.button("Clear"):
392
+ st.session_state.button_enter_url = False
393
+ st.session_state.Initial = True
394
+ st.session_state.extracted_url = False
395
+ if 'sitemap_data_text' in st.session_state:
396
+ del st.session_state['sitemap_data_text']
397
+ if 'sitemap_data_jsonl' in st.session_state:
398
+ del st.session_state['sitemap_data_jsonl']
399
+ st.session_state.button_enter_url = False
400
+ st.experimental_rerun()
401
  if saved_successfully:
402
  # Confirmation message
403
  st.success(f"File saved successfully.")
 
405
  st.write("#")
406
  else:
407
  st.warning("Data not extracted")
408
+ notextracted_c1,notextracted_c2 = st.columns([0.5,0.5])
409
+ with notextracted_c1:
410
+ if st.button("clear"):
411
+ st.session_state.button_enter_url = False
412
+ st.session_state.extracted_url = False
413
+ st.experimental_rerun()
414
+ with notextracted_c2:
415
+ print()
416
  st.write("#")
417
  st.write("#")
418
 
styles.css CHANGED
@@ -25,6 +25,9 @@
25
  margin: 0px;
26
  }
27
 
 
 
 
28
  /* #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) {
29
  background-color: #3498db;
30
  padding: 5px;
 
25
  margin: 0px;
26
  }
27
 
28
+ button.css-1oz26th.edgvbvh10 {
29
+ width: 100%;
30
+ }
31
  /* #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) {
32
  background-color: #3498db;
33
  padding: 5px;
utils/__pycache__/footer.cpython-38.pyc CHANGED
Binary files a/utils/__pycache__/footer.cpython-38.pyc and b/utils/__pycache__/footer.cpython-38.pyc differ