MaksG committed on
Commit
396839f
1 Parent(s): 0226df2

Update scrape_3gpp.py

Files changed (1)
  1. scrape_3gpp.py +39 -44
scrape_3gpp.py CHANGED
@@ -9,31 +9,6 @@ import textract
 import gradio as gr
 
 
-def count_links(url):
-    # Define common file extensions for downloadable content
-    file_extensions = ('.zip')
-
-    try:
-        # Send a HTTP request to the URL
-        response = requests.get(url)
-        response.raise_for_status()  # Raise an exception for HTTP errors
-
-        # Parse the HTML content of the page
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Find all <a> tags in the HTML
-        links = soup.find_all('a')
-
-        # Count the number of links that point to downloadable files
-        count = sum(1 for link in links if any(link.get('href', '').endswith(ext) for ext in file_extensions))
-
-        return count
-    except requests.RequestException as e:
-        print(f"Error fetching the page: {e}")
-        return None
-
-
-
 def browse_folder(url):
     if url.lower().endswith(('docs', 'docs/')):
         return gr.update(choices=[])
@@ -87,11 +62,11 @@ def extract_statuses(url):
     return []
 
 
-def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
+def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     filenames = []
     status_filenames = []
     # Check if the excel_file argument is provided and if the file exists.
-    excel_file_path = 'guide_status.xlsx'  # Hardcoded path to the Excel file
+    excel_file_path = "guide_status.xlsx"  # Hardcoded path to the Excel file
 
     if os.path.exists(excel_file_path):
        try:
@@ -141,7 +116,7 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
     download_directory = folder_name
     if not os.path.exists(download_directory):
         os.makedirs(download_directory)
-
+
     pourcentss = 0.05
     print(f'filenames: {status_filenames}')
     if not filenames and not status_filenames:
@@ -157,11 +132,11 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
 
         # Filter the links ending in ".zip"
         zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
-
+
         # Download each zip file
         for zip_link in zip_links:
             progress(pourcentss,desc='Downloading')
-            pourcentss+=0.4/count
+            pourcentss+=0.4/len(df)
             # Build the absolute URL of the zip file
             absolute_url = urljoin(url, zip_link)
 
@@ -184,7 +159,7 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
            filename = os.path.basename(file_url)
            save_path = os.path.join(download_directory, filename)
            progress(pourcentss,desc='Downloading')
-           pourcentss+=0.4/count
+           pourcentss+=0.4/len(df)
            try:
                with requests.get(file_url, stream=True) as r:
                    r.raise_for_status()
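
Side note on the progress arithmetic these two hunks touch: 40% of the bar (0.05 up to about 0.45) is spread evenly across the downloads, and the divisor is now len(df), presumably the row count of the status workbook read at the top of scrape(), rather than the removed count_links() result. A minimal sketch, with n_files as a made-up stand-in for len(df):

    pourcentss = 0.05                 # starting progress, as in scrape()
    n_files = 10                      # hypothetical stand-in for len(df)
    for _ in range(n_files):
        pourcentss += 0.4 / n_files   # each file advances the bar equally
    print(round(pourcentss, 2))       # 0.45 once every file is downloaded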
@@ -210,14 +185,19 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
            print(f"HTTP error occurred: {file_url}: {e}")
            return False, "Il n'y a pas de colonne action ou alors celle ci n'est pas bien écrite, format attendu: 'Actions'"
 
-    return True, "Téléchargement terminé !"
+    return True, "Téléchargement terminé !", len(df)
 
 
 
-def extractZip(folder_name):
+def extractZip(url):
     # Directory where the zip files have already been downloaded
-    download_directory = folder_name
-    extract_directory = folder_name + " extraction"  # Directory where the zip contents will be extracted
+    nom_extract = url.split("/")[-3] + "_extraction"
+    if os.path.exists(nom_extract):
+        shutil.rmtree(nom_extract)
+    extract_directory = nom_extract
+
+    download_directory = url.split("/")[-3] + "_downloads"
+    # Directory where the zip contents will be extracted
 
     # Extract the contents of all zip files in the download directory
     for zip_file in os.listdir(download_directory):
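
For reference, how the new url.split("/")[-3] naming behaves on a typical 3GPP Docs URL (the URL below is an example, not taken from the commit). With the trailing slash, the old [-2] index lands on 'Docs', which may be what this revision corrects:

    url = "https://www.3gpp.org/ftp/tsg_sa/WG2_Arch/TSGS2_160/Docs/"
    parts = url.split("/")            # [..., 'TSGS2_160', 'Docs', '']
    print(parts[-3])                  # 'TSGS2_160'  (the meeting folder)
    print(parts[-3] + "_downloads")   # 'TSGS2_160_downloads'
    print(parts[-3] + "_extraction")  # 'TSGS2_160_extraction'
    print(parts[-2])                  # 'Docs' -- what [-2] would pick up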
@@ -233,6 +213,7 @@ def extractZip(folder_name):
            os.makedirs(extract_dir)
 
        # Extract the contents of the zip file
+
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
 
@@ -242,6 +223,7 @@ def extractZip(folder_name):
 
    print("Toutes les extractions sont terminées !")
 
+
 def excel3gpp(url):
    response = requests.get(url)
    response.raise_for_status()  # This will raise an exception if there's an error
@@ -263,12 +245,16 @@ def excel3gpp(url):
    excel_response.raise_for_status()
 
    # Define the path where you want to save the file
-   filename = excel_url.split('/')[-1]
-   filepath = os.path.join('path_to_save_directory', filename)  # Replace 'path_to_save_directory' with your desired path
+   # Replace 'path_to_save_directory' with your desired path
 
    # Write the content of the Excel file to a local file
    # Write the content of the Excel file to a local file named 'guide.xlsx'
-   filepath = 'guide.xlsx'  # Directly specify the filename
+
+   nom_guide = 'guide.xlsx'  # Directly specify the filename
+   if os.path.exists(nom_guide):
+       os.remove(nom_guide)
+   filepath = nom_guide
+
 
    with open(filepath, 'wb') as f:
        f.write(excel_response.content)
@@ -300,24 +286,32 @@ def update_excel(data, excel_file, url):
        print(f"Error updating Excel file: {e}")
 
 def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
-   folder_name = 'nom provisoire'
-   temp_excel = url.split("/")[-2] + "_status.xlsx"
+   nom_download = url.split("/")[-3] + "_downloads"
+   if os.path.exists(nom_download):
+       shutil.rmtree(nom_download)
+   folder_name = nom_download
+
+   nom_status = url.split("/")[-3] + "_status.xlsx"
+   if os.path.exists(nom_status):
+       os.remove(nom_status)
+   temp_excel = nom_status
+
    progress(0.0,desc='Downloading')
-   count = count_links(url)
-   result, message = scrape(url, excel_file, folder_name, status_list)
+
+   result, message, count = scrape(url, excel_file, folder_name, status_list)
    if result:
        print("Success:", message)
    else:
        return(None, message)
 
    progress(0.4,desc='Extraction')
-   extractZip(folder_name)
+   extractZip(url)
    progress(0.5,desc='Extraction 2')
    excel3gpp(url)
    progress(0.6,desc='Creating Excel File')
 
 
-   extract_directory = folder_name +" extraction"
+   extract_directory = url.split("/")[-3] + "_extraction"
    categories = {
        "Other": ["URL", "File", "Type", "Title", "Source", "Content"],
        "CR": ["URL", "File", "Type", "Title", "Source", "Content"],
@@ -518,3 +512,4 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
    file_name = temp_excel
    # Save the updated DataFrame to Excel
    return file_name, "Téléchargement réussi"
+
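
Taken together, a hypothetical call to the updated entry point (example URL; the output name follows the [-3] scheme shown above):

    file_name, message = extractionPrincipale(
        "https://www.3gpp.org/ftp/tsg_sa/WG2_Arch/TSGS2_160/Docs/"
    )
    print(file_name)   # 'TSGS2_160_status.xlsx' on success, None on failure
    print(message)     # 'Téléchargement réussi' or an error message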
 
 