heymenn committed on
Commit
1c64a4a
1 Parent(s): 36947a6

Update scrape_3gpp.py

Browse files
Files changed (1) hide show
  1. scrape_3gpp.py +19 -15
scrape_3gpp.py CHANGED
@@ -215,7 +215,7 @@ def remod_text(text):
215
  return text.replace("/n", "\n")
216
 
217
  def update_excel(data, excel_file, url):
218
- new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"]
219
  temp_df = pd.DataFrame(data, columns=new_df_columns)
220
 
221
  try:
@@ -258,17 +258,18 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
258
 
259
 
260
  extract_directory = url.split("/")[-3] + "_extraction"
 
261
  categories = {
262
- "Other": ["URL", "File", "Type", "Title", "Source", "Content"],
263
- "CR": ["URL", "File", "Type", "Title", "Source", "Content"],
264
- "pCR":["URL", "File", "Type", "Title", "Source", "Content"],
265
- "LS": ["URL", "File", "Type", "Title", "Source", "Content"],
266
- "WID": ["URL", "File", "Type", "Title", "Source", "Content"],
267
- "SID": ["URL", "File", "Type", "Title", "Source", "Content"],
268
- "DISCUSSION": ["URL", "File", "Type", "Title", "Source", "Content"],
269
- "pdf": ["URL", "File", "Type", "Title", "Source", "Content"],
270
- "ppt": ["URL", "File", "Type", "Title", "Source", "Content"],
271
- "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
272
  }
273
 
274
  pourcents2=0.6
@@ -282,7 +283,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
282
  df = pd.read_excel(temp_excel)
283
  except Exception as e:
284
  print(f"Initializing a new DataFrame because: {e}")
285
- df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Content"])
286
 
287
  for folder in os.listdir(extract_directory):
288
  folder_path = os.path.join(extract_directory, folder)
@@ -455,24 +456,27 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
455
  # Assuming 'source' needs to be filled from the guide.xlsx mapping
456
  # Placeholder for source value calculation
457
  source = "" # Update this with actual source determination logic
 
458
  status = ""
459
- data.append([url+ "/" + folder + '.zip', folder , category, title, source,status, contenu])
460
 
461
  guide_file = 'guide.xlsx'
462
  if os.path.exists(guide_file):
463
  # If guide.xlsx exists, proceed with operations that require it
464
  try:
465
- guide_df = pd.read_excel(guide_file, usecols=['Source', 'TDoc', 'TDoc Status'])
466
  # Continue with the operations that require guide.xlsx
467
  # For example, reading the file, processing the data, etc.
468
  tdoc_source_map = {row['TDoc']: row['Source'] for index, row in guide_df.iterrows()}
 
469
  tdoc_status_map = {row['TDoc']: row['TDoc Status'] for index, row in guide_df.iterrows()}
470
  # Update the 'Source' in your data based on matching 'Nom du fichier' with 'TDoc'
471
  for item in data:
472
  nom_du_fichier = item[1] # The file (TDoc) name is the second element (index 1) of each data row
473
  if nom_du_fichier in tdoc_source_map:
474
  item[4] = tdoc_source_map[nom_du_fichier] # Update the 'Source' field, which is the fifth element (index 4)
475
- item[5] = tdoc_status_map[nom_du_fichier]
 
476
  # Your code that depends on guide.xlsx goes here
477
 
478
  except Exception as e:
 
215
  return text.replace("/n", "\n")
216
 
217
  def update_excel(data, excel_file, url):
218
+ new_df_columns = ["URL", "File", "Type", "Title", "Source", "Related WIs", "Status", "Content"]
219
  temp_df = pd.DataFrame(data, columns=new_df_columns)
220
 
221
  try:
 
258
 
259
 
260
  extract_directory = url.split("/")[-3] + "_extraction"
261
+ TabCategories = ["URL", "File", "Title", "Source", "Related WIs", "Content"]
262
  categories = {
263
+ "Other": TabCategories,
264
+ "CR": TabCategories,
265
+ "pCR": TabCategories,
266
+ "LS": TabCategories,
267
+ "WID": TabCategories,
268
+ "SID": TabCategories,
269
+ "DISCUSSION": TabCategories,
270
+ "pdf": TabCategories,
271
+ "ppt": TabCategories,
272
+ "pptx": TabCategories
273
  }
274
 
275
  pourcents2=0.6
 
283
  df = pd.read_excel(temp_excel)
284
  except Exception as e:
285
  print(f"Initializing a new DataFrame because: {e}")
286
+ df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Related WIs", "Content"])
287
 
288
  for folder in os.listdir(extract_directory):
289
  folder_path = os.path.join(extract_directory, folder)
 
456
  # Assuming 'source' needs to be filled from the guide.xlsx mapping
457
  # Placeholder for source value calculation
458
  source = "" # Update this with actual source determination logic
459
+ RelatedWIs = ""
460
  status = ""
461
+ data.append([url+ "/" + folder + '.zip', folder , category, title, source, RelatedWIs, status, contenu])
462
 
463
  guide_file = 'guide.xlsx'
464
  if os.path.exists(guide_file):
465
  # If guide.xlsx exists, proceed with operations that require it
466
  try:
467
+ guide_df = pd.read_excel(guide_file, usecols=['Source', 'TDoc', 'RelatedWIs', 'TDoc Status'])
468
  # Continue with the operations that require guide.xlsx
469
  # For example, reading the file, processing the data, etc.
470
  tdoc_source_map = {row['TDoc']: row['Source'] for index, row in guide_df.iterrows()}
471
+ tdoc_relatedWIs_map = {row['TDoc']: row['Related WIs'] for index, row in guide_df.iterrows()}
472
  tdoc_status_map = {row['TDoc']: row['TDoc Status'] for index, row in guide_df.iterrows()}
473
  # Update 'Source', 'Related WIs' and 'Status' in each data row whose file name matches a 'TDoc' in the guide
474
  for item in data:
475
  nom_du_fichier = item[1] # The file (TDoc) name is the second element (index 1) of each data row
476
  if nom_du_fichier in tdoc_source_map:
477
  item[4] = tdoc_source_map[nom_du_fichier] # Update the 'Source' field, which is the fifth element (index 4)
478
+ item[5] = tdoc_relatedWIs_map[nom_du_fichier]
479
+ item[6] = tdoc_status_map[nom_du_fichier]
480
  # Your code that depends on guide.xlsx goes here
481
 
482
  except Exception as e: