| import argparse | |
| import glob | |
| import os | |
| import time | |
| import urllib | |
| import librosa | |
| import pandas as pd | |
| import requests | |
| import soundfile as sf | |
| from bs4 import BeautifulSoup | |
| from omegaconf import OmegaConf | |
| from pydub import AudioSegment | |
| from requests.exceptions import Timeout | |
class Scraper:
    """Crawl soundeffect-lab.info, download its MP3 clips and convert them to WAV.

    Metadata for every clip downloaded during this run is collected in
    ``self.df`` (columns ``filename``, ``title``, ``category``, ``url``) and
    written to the CSV path named in the config.
    """

    def __init__(self, config):
        """Load the config, discover category pages and create the data directory.

        Args:
            config: Path of a file loadable by ``OmegaConf.load``. The class
                reads the keys ``path_data``, ``path_csv`` and ``sample_rate``.

        Raises:
            ValueError: If fetching the site's top page times out.
        """
        self.base_url = "https://soundeffect-lab.info/"
        self.df = pd.DataFrame([], columns=["filename", "title", "category", "url"])
        # Next row label to use when appending to ``self.df``.
        self.idx = 0
        self.config = OmegaConf.load(config)
        self.setup()
        os.makedirs(self.config.path_data, exist_ok=True)
        # Categories whose page has already been crawled.
        self.history = []

    def run(self):
        """Download every clip, then convert the downloads to WAV."""
        self.all_get()
        self.preprocess()

    @staticmethod
    def _filter_sound_links(links):
        """Split ``(text, href)`` pairs into parallel lists of category links.

        Only pairs whose ``href`` contains ``"/sound/"`` are kept; anchors
        without an ``href`` (``None`` or empty) are ignored instead of raising.

        Returns:
            A ``(urls, categories)`` tuple of equally long lists.
        """
        urls, categories = [], []
        for text, href in links:
            if href and "/sound/" in href:
                urls.append(href)
                categories.append(text)
        return urls, categories

    @staticmethod
    def _wav_path(mp3_path):
        """Return ``mp3_path`` with its file extension replaced by ``.wav``."""
        # splitext only touches the final extension, so a directory that
        # happens to contain ".mp3" in its name is left intact.
        return os.path.splitext(mp3_path)[0] + ".wav"

    def setup(self):
        """Fetch the top page and record the category links found on it.

        Populates ``self.urls`` and ``self.categories`` (parallel lists).

        Raises:
            ValueError: If the request times out.
        """
        try:
            html = requests.get(self.base_url, timeout=5)
        except Timeout as err:
            raise ValueError("Time Out") from err
        soup = BeautifulSoup(html.content, "html.parser")
        anchors = ((tag.text, tag.get("href")) for tag in soup.select("a"))
        self.urls, self.categories = self._filter_sound_links(anchors)

    def all_get(self):
        """Crawl every category found by ``setup`` and write the metadata CSV."""
        from urllib.parse import urljoin

        try:
            for url, category in zip(self.urls, self.categories):
                # urljoin resolves root-relative, relative and absolute hrefs alike.
                self.download(urljoin(self.base_url, url), category)
        finally:
            # Persist whatever was collected even if the crawl is interrupted.
            self.df.to_csv(self.config.path_csv)

    def download(self, now_url, category):
        """Download the clips listed on one category page, then follow its page menu.

        Files that already exist in the data directory are skipped. Each
        successful download appends one row to ``self.df``. Pages linked from
        the ``pagemenu-top`` element are crawled recursively unless their name
        is already in ``self.history``.

        Args:
            now_url: Absolute URL of the category page.
            category: Category name stored with each clip and in ``self.history``.
        """
        from urllib.parse import urljoin
        from urllib.request import urlretrieve

        try:
            html = requests.get(now_url, timeout=5)
        except Timeout:
            print(f"Time Out: {now_url}")
            return

        soup = BeautifulSoup(html.content, "html.parser")
        body = soup.find(id="wrap").find("main")
        tags = body.find(id="playarea").select("a")
        count = 0
        for tag in tags:
            href = tag.get("href")
            # The name comes from remote HTML: keep only the final path
            # component so it cannot point outside the data directory.
            name = os.path.basename(tag.get("download") or "")
            if not name or not href:
                continue  # not a download link
            filename = os.path.join(self.config.path_data, name)
            if os.path.exists(filename):
                continue
            try:
                urlretrieve(urljoin(now_url, href), filename)
            except Exception as err:
                # Best effort: report the failure and move on, but do not
                # leave a truncated file that would block a later retry.
                print(f"Download failed: {href} ({err})")
                if os.path.exists(filename):
                    os.remove(filename)
                continue
            title = name.replace(".mp3", "")
            self.df.loc[self.idx] = {
                "filename": filename,
                "title": title,
                "category": category,
                "url": f"https://soundeffect-lab.info/sound/search.php?s={title}",
            }
            self.idx += 1
            time.sleep(2)  # pause between downloads
            count += 1

        self.history.append(category)
        print(now_url, category, len(tags), count)
        # NOTE: this sanity check compares every file in the data directory
        # with the rows collected in this run, so it only holds when the
        # directory was empty when the crawl started.
        paths = glob.glob(os.path.join(self.config.path_data, "*"))
        assert len(paths) == len(self.df)

        for other in body.find(id="pagemenu-top").select("a"):
            other_url = other.get("href")
            image = other.find("img")
            if not other_url or image is None:
                continue
            other_name = image.get("alt")
            # Checked right before recursing: a nested call may have crawled
            # this page since the loop started.
            if other_name in self.history:
                continue
            self.download(urljoin(now_url, other_url), other_name)

    def preprocess(self):
        """Convert every downloaded MP3 to WAV and write a resampled mono copy.

        For each row of ``self.df`` the MP3 is exported as ``<name>.wav`` next
        to it, then reloaded as mono at ``config.sample_rate`` and written as
        16-bit PCM to ``new_<name>.wav`` in the same directory.
        """
        # ``filename`` already includes ``path_data`` (see ``download``), so it
        # is used as-is; joining ``path_data`` again would double the directory.
        mp3_paths = list(self.df["filename"])
        for mp3_path in mp3_paths:
            AudioSegment.from_mp3(mp3_path).export(
                self._wav_path(mp3_path), format="wav"
            )
        for mp3_path in mp3_paths:
            wav_path = self._wav_path(mp3_path)
            y, sr = librosa.core.load(wav_path, sr=self.config.sample_rate, mono=True)
            directory, name = os.path.split(wav_path)
            sf.write(os.path.join(directory, "new_" + name), y, sr, subtype="PCM_16")
def argparser():
    """Parse the command line and return the resulting namespace.

    The only option is ``-c`` / ``--config``: the path of the config file,
    defaulting to ``config.yaml``.
    """
    cli = argparse.ArgumentParser()
    option_flags = ("-c", "--config")
    cli.add_argument(
        *option_flags,
        type=str,
        default="config.yaml",
        help="File path for config file.",
    )
    return cli.parse_args()
if __name__ == "__main__":
    # Script entry point: read the config path from the command line,
    # build the scraper from it, then download and preprocess the clips.
    args = argparser()
    config_path = args.config
    scraper = Scraper(config_path)
    scraper.run()