sam2ai committed on
Commit
b18c1e9
·
1 Parent(s): 26998f0

Synced repo using 'sync_with_huggingface' Github Action

Browse files
Files changed (1) hide show
  1. pages/1_URLs.py +16 -7
pages/1_URLs.py CHANGED
@@ -125,7 +125,7 @@ sitemap_data = ""
125
 
126
 
127
  # function to process a batch of URLS in sitemaps
128
- def process_urls(sitemap_urls):
129
 
130
  extracted_txt = ""
131
  extracted_jsonl_list= []
@@ -135,7 +135,7 @@ def process_urls(sitemap_urls):
135
  # using justext to extract data
136
  temp_para = extract_data_from_url_(url)
137
  temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
138
- temp_jsonl_data = {"text": temp_para, "url": url}
139
  extracted_txt += temp_txt_data
140
  extracted_jsonl_list.append(temp_jsonl_data)
141
  else:
@@ -150,7 +150,7 @@ def process_urls(sitemap_urls):
150
 
151
 
152
  # function to process for a single URL
153
- def run_function(url):
154
  extracted_txt = ""
155
  # Check if the user has provided a URL
156
  if url:
@@ -158,7 +158,7 @@ def run_function(url):
158
  temp_para = extract_data_from_url_(url)
159
  temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
160
  extracted_txt = temp_txt_data
161
- extracted_jsonl = {"text": str(temp_para), "url":str(url)}
162
 
163
  # displaying extracted txt for single URL
164
  st.text_area("Extracted Text", value=extracted_txt, height=200)
@@ -180,6 +180,10 @@ def run_function(url):
180
  def main():
181
  st.subheader("Extract Data from URLs")
182
 
 
 
 
 
183
  # dividing the body section into 2 columns for url and enter button
184
  col1, col2 = st.columns([0.7,0.3])
185
 
@@ -199,6 +203,8 @@ def main():
199
  st.session_state.extracted_url = False
200
  data = ""
201
 
 
 
202
  # the enter button
203
  if st.session_state.button_enter_url:
204
  # check if it is a sitemap or not
@@ -240,7 +246,7 @@ def main():
240
  start_index = i * split_size
241
  end_index = start_index + split_size if i != num_threads - 1 else None
242
  temp_urls = stored_sitemap_urls[start_index:end_index]
243
- future = executor.submit(process_urls, temp_urls)
244
  futures.append(future)
245
 
246
  # Retrieve the extracted data from each thread
@@ -284,7 +290,7 @@ def main():
284
 
285
  else:
286
  url = url_or_xml
287
- st.session_state.extracted_url, data_txt, data_jsonl = run_function(url)
288
 
289
 
290
  if st.session_state.extracted_url:
@@ -355,13 +361,16 @@ def main():
355
  if saved_successfully:
356
  # Confirmation message
357
  st.success(f"File saved successfully.")
358
-
 
359
  else:
360
  st.warning("Data not extracted")
361
  if st.button("clear"):
362
  st.session_state.button_enter_url = False
363
  st.session_state.extracted_url = False
364
  st.experimental_rerun()
 
 
365
 
366
 
367
  # Add a success message to the sidebar
 
125
 
126
 
127
  # function to process a batch of URLS in sitemaps
128
+ def process_urls(sitemap_urls , category):
129
 
130
  extracted_txt = ""
131
  extracted_jsonl_list= []
 
135
  # using justext to extract data
136
  temp_para = extract_data_from_url_(url)
137
  temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
138
+ temp_jsonl_data = {"text": temp_para, "url": url, "category": category, "timestamp": str(datetime.datetime.now())}
139
  extracted_txt += temp_txt_data
140
  extracted_jsonl_list.append(temp_jsonl_data)
141
  else:
 
150
 
151
 
152
  # function to process for a single URL
153
+ def run_function(url , category):
154
  extracted_txt = ""
155
  # Check if the user has provided a URL
156
  if url:
 
158
  temp_para = extract_data_from_url_(url)
159
  temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
160
  extracted_txt = temp_txt_data
161
+ extracted_jsonl = {"text": str(temp_para), "url":str(url) , "category": category , "timestamp": str(datetime.datetime.now())}
162
 
163
  # displaying extracted txt for single URL
164
  st.text_area("Extracted Text", value=extracted_txt, height=200)
 
180
  def main():
181
  st.subheader("Extract Data from URLs")
182
 
183
+ category = st.selectbox(
184
+ 'Select a Category',
185
+ ('News Articles','Poems','Magazines', 'Other') )
186
+
187
  # dividing the body section into 2 columns for url and enter button
188
  col1, col2 = st.columns([0.7,0.3])
189
 
 
203
  st.session_state.extracted_url = False
204
  data = ""
205
 
206
+
207
+
208
  # the enter button
209
  if st.session_state.button_enter_url:
210
  # check if it is a sitemap or not
 
246
  start_index = i * split_size
247
  end_index = start_index + split_size if i != num_threads - 1 else None
248
  temp_urls = stored_sitemap_urls[start_index:end_index]
249
+ future = executor.submit(process_urls, temp_urls, category)
250
  futures.append(future)
251
 
252
  # Retrieve the extracted data from each thread
 
290
 
291
  else:
292
  url = url_or_xml
293
+ st.session_state.extracted_url, data_txt, data_jsonl = run_function(url , category)
294
 
295
 
296
  if st.session_state.extracted_url:
 
361
  if saved_successfully:
362
  # Confirmation message
363
  st.success(f"File saved successfully.")
364
+ st.write("#")
365
+ st.write("#")
366
  else:
367
  st.warning("Data not extracted")
368
  if st.button("clear"):
369
  st.session_state.button_enter_url = False
370
  st.session_state.extracted_url = False
371
  st.experimental_rerun()
372
+ st.write("#")
373
+ st.write("#")
374
 
375
 
376
  # Add a success message to the sidebar