import os
import time
import requests
import csv
from bs4 import BeautifulSoup

# Check whether the page contains sub-categories (no document <table> on the page)
def has_subcategories(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.find('table') is None

# Save one row of metadata to a CSV file
def save_to_csv(data, csv_file):
    file_exists = os.path.exists(csv_file)
    with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write headers only if the file is new or empty
        if not file_exists or os.stat(csv_file).st_size == 0:
            writer.writerow(["Id", "Catégorie", "Nom du document", "Lien", "Langue"])
        writer.writerow(data)

# Retrieve the last ID from the CSV file and return the next one
def get_last_id(csv_file):
    if not os.path.exists(csv_file):
        return 1  # Start at 1 if the file does not exist
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        rows = list(csv.reader(file))
    if len(rows) < 2:
        return 1  # Only the header (or nothing) has been written so far
    return int(rows[-1][0]) + 1  # Return the next ID

# Scrape the metadata of the PDFs listed in an HTML table
def scrape_pdfs(url, category, id_counter, language, max_docs=None, processed_count=0):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if not table:
        return id_counter, processed_count
    for row in table.find_all('tr'):
        if max_docs is not None and processed_count >= max_docs:
            return id_counter, processed_count
        first_td = row.find('td')
        if first_td:
            link = first_td.find('a')
            if link and 'href' in link.attrs:
                pdf_link = link['href']
                title = link.get_text().strip()
                pdf_data = [id_counter, category, title, f'https://adala.justice.gov.ma{pdf_link}', language]
                save_to_csv(pdf_data, 'dataset/docs_metadata.csv')
                print(f"-> Ajouté : {pdf_data}")
                id_counter += 1
                processed_count += 1
                time.sleep(1)  # pause between processed documents to avoid hammering the site
    return id_counter, processed_count

# Main function: recursively explore the sub-categories and scrape the PDFs
def scrape_documents(url, base_folder, id_counter, language, max_docs=None, processed_count=0):
    if max_docs is not None and processed_count >= max_docs:
        return id_counter, processed_count
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    h2 = soup.find('h2')
    category_name = h2.get_text().strip() if h2 else "Documents"
    if has_subcategories(url):
        main = soup.find('main')
        if main:
            sections = main.find_all('section')
            if sections:
                last_section = sections[-1]
                ul = last_section.find('ul')
                if ul:
                    for a in ul.find_all('a'):
                        if max_docs is not None and processed_count >= max_docs:
                            break
                        if 'href' in a.attrs:
                            subcategory_name = a.get_text().strip().replace("Parcourir", "").strip()
                            sub_link = a['href']
                            full_url = f'https://adala.justice.gov.ma{sub_link}'
                            id_counter, processed_count = scrape_documents(
                                full_url, subcategory_name, id_counter, language, max_docs, processed_count)
    else:
        id_counter, processed_count = scrape_pdfs(
            url, category_name, id_counter, language, max_docs, processed_count)
    return id_counter, processed_count
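
# Minimal usage sketch, not part of the original script. Assumptions: the starting
# category URL below is a hypothetical placeholder on adala.justice.gov.ma, the
# language code 'fr' and max_docs value are illustrative only, and the dataset/
# folder is created up front so save_to_csv() can append to dataset/docs_metadata.csv.
if __name__ == '__main__':
    os.makedirs('dataset', exist_ok=True)                 # ensure the output folder exists
    next_id = get_last_id('dataset/docs_metadata.csv')    # resume numbering from any previous run
    start_url = 'https://adala.justice.gov.ma/...'        # hypothetical starting category URL (placeholder)
    scrape_documents(start_url, 'Documents', next_id, 'fr', max_docs=10)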