MaksG committed on
Commit
b6d3aa2
1 Parent(s): 92d0a3c

Update scrape_3gpp.py

Browse files
Files changed (1) hide show
  1. scrape_3gpp.py +93 -13
scrape_3gpp.py CHANGED
@@ -22,9 +22,40 @@ def browse_folder(url):
22
  return gr.update(choices=excel_links)
23
 
24
 
25
- def scrape(url, excel_file, folder_name,progress=gr.Progress()):
26
  filenames = []
 
27
  # Check if the excel_file argument is provided and if the file exists.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  if excel_file and os.path.exists(excel_file):
29
  try:
30
  df = pd.read_excel(excel_file)
@@ -34,7 +65,7 @@ def scrape(url, excel_file, folder_name,progress=gr.Progress()):
34
  df = df[df['Actions'] == 'x']
35
 
36
  elif 'File' in df.columns:
37
- filenames = [f"{url}/{row['File']}.zip" for index, row in df.iterrows()]
38
  elif 'URL' in df.columns:
39
  filenames = df['URL'].tolist()
40
  except Exception as e:
@@ -46,8 +77,8 @@ def scrape(url, excel_file, folder_name,progress=gr.Progress()):
46
  if not os.path.exists(download_directory):
47
  os.makedirs(download_directory)
48
 
49
-
50
- if not filenames:
51
  print("No Excel file provided, or no valid URLs found in the file.")
52
  # You can either return here or continue with other predefined logic
53
  response = requests.get(url)
@@ -85,7 +116,20 @@ def scrape(url, excel_file, folder_name,progress=gr.Progress()):
85
  for chunk in r.iter_content(chunk_size=8192):
86
  f.write(chunk)
87
 
 
 
 
 
 
88
 
 
 
 
 
 
 
 
 
89
 
90
  else:
91
  # Proceed with downloading files using the filenames list
@@ -126,7 +170,6 @@ def extractZip(folder_name):
126
  os.makedirs(extract_dir)
127
 
128
  # Extraire le contenu du fichier zip
129
- print(f"Extraction en cours pour {zip_file}")
130
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
131
  zip_ref.extractall(extract_dir)
132
 
@@ -176,10 +219,28 @@ def replace_line_breaks(text):
176
  def remod_text(text):
177
  return text.replace("/n", "\n")
178
 
179
- def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
180
- folder_name = url.split("/")[-2]
181
- progress(0.1,desc='Telechargement')
182
- result, message = scrape(url, excel_file, folder_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  if result:
184
  print("Success:", message)
185
  else:
@@ -191,6 +252,7 @@ def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
191
  excel3gpp(url)
192
  progress(0.6,desc='Mise en forme Excel')
193
 
 
194
  extract_directory = folder_name +" extraction"
195
  categories = {
196
  "Other": ["URL", "File", "Type", "Title", "Source", "Content"],
@@ -204,18 +266,28 @@ def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
204
  "ppt": ["URL", "File", "Type", "Title", "Source", "Content"],
205
  "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
206
  }
207
- nouv=0
208
  num=0.6
209
  data = []
210
  errors_count = 0
 
 
211
  pre_title_section = None
 
 
 
 
 
 
 
212
  for folder in os.listdir(extract_directory):
213
  folder_path = os.path.join(extract_directory, folder)
214
  if os.path.isdir(folder_path):
215
  for file in os.listdir(folder_path):
216
- num=num + nouv/400
217
  progress(num,desc='Mise en forme Excel')
218
- nouv+=1
 
219
  if file == "__MACOSX":
220
  continue
221
  file_path = os.path.join(folder_path, file)
@@ -366,6 +438,14 @@ def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
366
  item[5] = tdoc_status_map[nom_du_fichier]
367
 
368
 
 
 
 
 
 
 
 
 
369
 
370
 
371
  new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"] # Create a DataFrame with the updated data
@@ -401,4 +481,4 @@ def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
401
  file_name = url.split("/")[-2] + ".xlsx"
402
  # Save the updated DataFrame to Excel
403
  df.to_excel(file_name, index=False)
404
- return file_name, "Téléchargement réussi"
 
22
  return gr.update(choices=excel_links)
23
 
24
 
25
+ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
26
  filenames = []
27
+ status_filenames = []
28
  # Check if the excel_file argument is provided and if the file exists.
29
+ excel_file_path = '/content/guide_status.xlsx' # Hardcoded path to the Excel file
30
+
31
+ if os.path.exists(excel_file_path):
32
+ try:
33
+ df = pd.read_excel(excel_file_path)
34
+ print(f"Initial DataFrame size: {len(df)}")
35
+
36
+ if 'TDoc Status' in df.columns:
37
+ df = df[df['TDoc Status'].isin(status_list)]
38
+ print(f"Filtered DataFrame size: {len(df)}")
39
+
40
+ if df.empty:
41
+ print("No files match the specified 'TDoc Status'.")
42
+ else:
43
+ if 'TDoc' in df.columns and not df['TDoc'].isnull().all():
44
+ status_filenames = [f"{url}{row['TDoc']}.zip" for index, row in df.iterrows()]
45
+ elif 'URL' in df.columns and not df['URL'].isnull().all():
46
+ status_filenames = df['URL'].tolist()
47
+ else:
48
+ print("No valid 'File' or 'URL' entries found for the filtered statuses.")
49
+
50
+ print(f"Filenames: {filenames}")
51
+ else:
52
+ print("'TDoc Status' column not found in the Excel file.")
53
+
54
+ except Exception as e:
55
+ print(f"Error reading Excel file: {e}")
56
+
57
+
58
+
59
  if excel_file and os.path.exists(excel_file):
60
  try:
61
  df = pd.read_excel(excel_file)
 
65
  df = df[df['Actions'] == 'x']
66
 
67
  elif 'File' in df.columns:
68
+ filenames = [f"{url}{row['File']}.zip" for index, row in df.iterrows()]
69
  elif 'URL' in df.columns:
70
  filenames = df['URL'].tolist()
71
  except Exception as e:
 
77
  if not os.path.exists(download_directory):
78
  os.makedirs(download_directory)
79
 
80
+ print(f'filenames: {status_filenames}')
81
+ if not filenames and not status_filenames:
82
  print("No Excel file provided, or no valid URLs found in the file.")
83
  # You can either return here or continue with other predefined logic
84
  response = requests.get(url)
 
116
  for chunk in r.iter_content(chunk_size=8192):
117
  f.write(chunk)
118
 
119
+ elif not filenames:
120
+ # Proceed with downloading files using the filenames list
121
+ for file_url in status_filenames:
122
+ filename = os.path.basename(file_url)
123
+ save_path = os.path.join(download_directory, filename)
124
 
125
+ try:
126
+ with requests.get(file_url, stream=True) as r:
127
+ r.raise_for_status()
128
+ with open(save_path, 'wb') as f:
129
+ for chunk in r.iter_content(chunk_size=8192):
130
+ f.write(chunk)
131
+ except requests.exceptions.HTTPError as e:
132
+ print(f"skipped file: {file_url}: {e}")
133
 
134
  else:
135
  # Proceed with downloading files using the filenames list
 
170
  os.makedirs(extract_dir)
171
 
172
  # Extraire le contenu du fichier zip
 
173
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
174
  zip_ref.extractall(extract_dir)
175
 
 
219
  def remod_text(text):
220
  return text.replace("/n", "\n")
221
 
222
+ def update_excel(data, excel_file, url):
223
+ new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"]
224
+ temp_df = pd.DataFrame(data, columns=new_df_columns)
225
+
226
+ try:
227
+ # Load the existing Excel file if it exists, else create a new one
228
+ if os.path.exists(excel_file):
229
+ old_df = pd.read_excel(excel_file)
230
+ df = pd.concat([old_df, temp_df], axis=0, ignore_index=True)
231
+ else:
232
+ df = temp_df
233
+
234
+ # Save the updated data back to the Excel file
235
+ df.to_excel(excel_file, index=False)
236
+ except Exception as e:
237
+ print(f"Error updating Excel file: {e}")
238
+
239
+ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
240
+ folder_name = 'nom provisoire'
241
+ temp_excel = '/content/temporaire.xlsx'
242
+ progress(0.0,desc='Telechargement')
243
+ result, message = scrape(url, excel_file, folder_name, status_list)
244
  if result:
245
  print("Success:", message)
246
  else:
 
252
  excel3gpp(url)
253
  progress(0.6,desc='Mise en forme Excel')
254
 
255
+
256
  extract_directory = folder_name +" extraction"
257
  categories = {
258
  "Other": ["URL", "File", "Type", "Title", "Source", "Content"],
 
266
  "ppt": ["URL", "File", "Type", "Title", "Source", "Content"],
267
  "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
268
  }
269
+
270
  num=0.6
271
  data = []
272
  errors_count = 0
273
+ processed_count = 0 # Counter for processed files
274
+
275
  pre_title_section = None
276
+
277
+ try:
278
+ df = pd.read_excel(excel_file)
279
+ except Exception as e:
280
+ print(f"Initializing a new DataFrame because: {e}")
281
+ df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Content"])
282
+
283
  for folder in os.listdir(extract_directory):
284
  folder_path = os.path.join(extract_directory, folder)
285
  if os.path.isdir(folder_path):
286
  for file in os.listdir(folder_path):
287
+ num = min(num + 0.001, 0.9)
288
  progress(num,desc='Mise en forme Excel')
289
+
290
+
291
  if file == "__MACOSX":
292
  continue
293
  file_path = os.path.join(folder_path, file)
 
438
  item[5] = tdoc_status_map[nom_du_fichier]
439
 
440
 
441
+ processed_count += 1
442
+
443
+ # Check if it's time to update the Excel file
444
+ if processed_count % 20 == 0:
445
+ update_excel(data, temp_excel, url)
446
+ print(f"Updated after processing {processed_count} files.")
447
+ data = [] # Clear the data list after updating
448
+
449
 
450
 
451
  new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"] # Create a DataFrame with the updated data
 
481
  file_name = url.split("/")[-2] + ".xlsx"
482
  # Save the updated DataFrame to Excel
483
  df.to_excel(file_name, index=False)
484
+ return file_name, "Téléchargement réussi"