MaksG committed on
Commit
b6d3aa2
1 Parent(s): 92d0a3c

Update scrape_3gpp.py

Browse files
Files changed (1) hide show
  1. scrape_3gpp.py +93 -13
scrape_3gpp.py CHANGED
@@ -22,9 +22,40 @@ def browse_folder(url):
22
  return gr.update(choices=excel_links)
23
 
24
 
25
- def scrape(url, excel_file, folder_name,progress=gr.Progress()):
26
  filenames = []
 
27
  # Check if the excel_file argument is provided and if the file exists.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  if excel_file and os.path.exists(excel_file):
29
  try:
30
  df = pd.read_excel(excel_file)
@@ -34,7 +65,7 @@ def scrape(url, excel_file, folder_name,progress=gr.Progress()):
34
  df = df[df['Actions'] == 'x']
35
 
36
  elif 'File' in df.columns:
37
- filenames = [f"{url}/{row['File']}.zip" for index, row in df.iterrows()]
38
  elif 'URL' in df.columns:
39
  filenames = df['URL'].tolist()
40
  except Exception as e:
@@ -46,8 +77,8 @@ def scrape(url, excel_file, folder_name,progress=gr.Progress()):
46
  if not os.path.exists(download_directory):
47
  os.makedirs(download_directory)
48
 
49
-
50
- if not filenames:
51
  print("No Excel file provided, or no valid URLs found in the file.")
52
  # You can either return here or continue with other predefined logic
53
  response = requests.get(url)
@@ -85,7 +116,20 @@ def scrape(url, excel_file, folder_name,progress=gr.Progress()):
85
  for chunk in r.iter_content(chunk_size=8192):
86
  f.write(chunk)
87
 
 
 
 
 
 
88
 
 
 
 
 
 
 
 
 
89
 
90
  else:
91
  # Proceed with downloading files using the filenames list
@@ -126,7 +170,6 @@ def extractZip(folder_name):
126
  os.makedirs(extract_dir)
127
 
128
  # Extraire le contenu du fichier zip
129
- print(f"Extraction en cours pour {zip_file}")
130
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
131
  zip_ref.extractall(extract_dir)
132
 
@@ -176,10 +219,28 @@ def replace_line_breaks(text):
176
  def remod_text(text):
177
  return text.replace("/n", "\n")
178
 
179
- def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
180
- folder_name = url.split("/")[-2]
181
- progress(0.1,desc='Telechargement')
182
- result, message = scrape(url, excel_file, folder_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  if result:
184
  print("Success:", message)
185
  else:
@@ -191,6 +252,7 @@ def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
191
  excel3gpp(url)
192
  progress(0.6,desc='Mise en forme Excel')
193
 
 
194
  extract_directory = folder_name +" extraction"
195
  categories = {
196
  "Other": ["URL", "File", "Type", "Title", "Source", "Content"],
@@ -204,18 +266,28 @@ def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
204
  "ppt": ["URL", "File", "Type", "Title", "Source", "Content"],
205
  "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
206
  }
207
- nouv=0
208
  num=0.6
209
  data = []
210
  errors_count = 0
 
 
211
  pre_title_section = None
 
 
 
 
 
 
 
212
  for folder in os.listdir(extract_directory):
213
  folder_path = os.path.join(extract_directory, folder)
214
  if os.path.isdir(folder_path):
215
  for file in os.listdir(folder_path):
216
- num=num + nouv/400
217
  progress(num,desc='Mise en forme Excel')
218
- nouv+=1
 
219
  if file == "__MACOSX":
220
  continue
221
  file_path = os.path.join(folder_path, file)
@@ -366,6 +438,14 @@ def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
366
  item[5] = tdoc_status_map[nom_du_fichier]
367
 
368
 
 
 
 
 
 
 
 
 
369
 
370
 
371
  new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"] # Create a DataFrame with the updated data
@@ -401,4 +481,4 @@ def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
401
  file_name = url.split("/")[-2] + ".xlsx"
402
  # Save the updated DataFrame to Excel
403
  df.to_excel(file_name, index=False)
404
- return file_name, "Téléchargement réussi"
 
22
  return gr.update(choices=excel_links)
23
 
24
 
25
+ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
26
  filenames = []
27
+ status_filenames = []
28
  # Check if the excel_file argument is provided and if the file exists.
29
+ excel_file_path = '/content/guide_status.xlsx' # Hardcoded path to the Excel file
30
+
31
+ if os.path.exists(excel_file_path):
32
+ try:
33
+ df = pd.read_excel(excel_file_path)
34
+ print(f"Initial DataFrame size: {len(df)}")
35
+
36
+ if 'TDoc Status' in df.columns:
37
+ df = df[df['TDoc Status'].isin(status_list)]
38
+ print(f"Filtered DataFrame size: {len(df)}")
39
+
40
+ if df.empty:
41
+ print("No files match the specified 'TDoc Status'.")
42
+ else:
43
+ if 'TDoc' in df.columns and not df['TDoc'].isnull().all():
44
+ status_filenames = [f"{url}{row['TDoc']}.zip" for index, row in df.iterrows()]
45
+ elif 'URL' in df.columns and not df['URL'].isnull().all():
46
+ status_filenames = df['URL'].tolist()
47
+ else:
48
+ print("No valid 'File' or 'URL' entries found for the filtered statuses.")
49
+
50
+ print(f"Filenames: {filenames}")
51
+ else:
52
+ print("'TDoc Status' column not found in the Excel file.")
53
+
54
+ except Exception as e:
55
+ print(f"Error reading Excel file: {e}")
56
+
57
+
58
+
59
  if excel_file and os.path.exists(excel_file):
60
  try:
61
  df = pd.read_excel(excel_file)
 
65
  df = df[df['Actions'] == 'x']
66
 
67
  elif 'File' in df.columns:
68
+ filenames = [f"{url}{row['File']}.zip" for index, row in df.iterrows()]
69
  elif 'URL' in df.columns:
70
  filenames = df['URL'].tolist()
71
  except Exception as e:
 
77
  if not os.path.exists(download_directory):
78
  os.makedirs(download_directory)
79
 
80
+ print(f'filenames: {status_filenames}')
81
+ if not filenames and not status_filenames:
82
  print("No Excel file provided, or no valid URLs found in the file.")
83
  # You can either return here or continue with other predefined logic
84
  response = requests.get(url)
 
116
  for chunk in r.iter_content(chunk_size=8192):
117
  f.write(chunk)
118
 
119
+ elif not filenames:
120
+ # Proceed with downloading files using the filenames list
121
+ for file_url in status_filenames:
122
+ filename = os.path.basename(file_url)
123
+ save_path = os.path.join(download_directory, filename)
124
 
125
+ try:
126
+ with requests.get(file_url, stream=True) as r:
127
+ r.raise_for_status()
128
+ with open(save_path, 'wb') as f:
129
+ for chunk in r.iter_content(chunk_size=8192):
130
+ f.write(chunk)
131
+ except requests.exceptions.HTTPError as e:
132
+ print(f"skipped file: {file_url}: {e}")
133
 
134
  else:
135
  # Proceed with downloading files using the filenames list
 
170
  os.makedirs(extract_dir)
171
 
172
  # Extraire le contenu du fichier zip
 
173
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
174
  zip_ref.extractall(extract_dir)
175
 
 
219
  def remod_text(text):
220
  return text.replace("/n", "\n")
221
 
222
+ def update_excel(data, excel_file, url):
223
+ new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"]
224
+ temp_df = pd.DataFrame(data, columns=new_df_columns)
225
+
226
+ try:
227
+ # Load the existing Excel file if it exists, else create a new one
228
+ if os.path.exists(excel_file):
229
+ old_df = pd.read_excel(excel_file)
230
+ df = pd.concat([old_df, temp_df], axis=0, ignore_index=True)
231
+ else:
232
+ df = temp_df
233
+
234
+ # Save the updated data back to the Excel file
235
+ df.to_excel(excel_file, index=False)
236
+ except Exception as e:
237
+ print(f"Error updating Excel file: {e}")
238
+
239
+ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
240
+ folder_name = 'nom provisoire'
241
+ temp_excel = '/content/temporaire.xlsx'
242
+ progress(0.0,desc='Telechargement')
243
+ result, message = scrape(url, excel_file, folder_name, status_list)
244
  if result:
245
  print("Success:", message)
246
  else:
 
252
  excel3gpp(url)
253
  progress(0.6,desc='Mise en forme Excel')
254
 
255
+
256
  extract_directory = folder_name +" extraction"
257
  categories = {
258
  "Other": ["URL", "File", "Type", "Title", "Source", "Content"],
 
266
  "ppt": ["URL", "File", "Type", "Title", "Source", "Content"],
267
  "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
268
  }
269
+
270
  num=0.6
271
  data = []
272
  errors_count = 0
273
+ processed_count = 0 # Counter for processed files
274
+
275
  pre_title_section = None
276
+
277
+ try:
278
+ df = pd.read_excel(excel_file)
279
+ except Exception as e:
280
+ print(f"Initializing a new DataFrame because: {e}")
281
+ df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Content"])
282
+
283
  for folder in os.listdir(extract_directory):
284
  folder_path = os.path.join(extract_directory, folder)
285
  if os.path.isdir(folder_path):
286
  for file in os.listdir(folder_path):
287
+ num = min(num + 0.001, 0.9)
288
  progress(num,desc='Mise en forme Excel')
289
+
290
+
291
  if file == "__MACOSX":
292
  continue
293
  file_path = os.path.join(folder_path, file)
 
438
  item[5] = tdoc_status_map[nom_du_fichier]
439
 
440
 
441
+ processed_count += 1
442
+
443
+ # Check if it's time to update the Excel file
444
+ if processed_count % 20 == 0:
445
+ update_excel(data, temp_excel, url)
446
+ print(f"Updated after processing {processed_count} files.")
447
+ data = [] # Clear the data list after updating
448
+
449
 
450
 
451
  new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"] # Create a DataFrame with the updated data
 
481
  file_name = url.split("/")[-2] + ".xlsx"
482
  # Save the updated DataFrame to Excel
483
  df.to_excel(file_name, index=False)
484
+ return file_name, "Téléchargement réussi"