|
import argparse
import glob
import os
import time
import urllib
import urllib.parse
import urllib.request

import librosa
import pandas as pd
import requests
import soundfile as sf
from bs4 import BeautifulSoup
from omegaconf import OmegaConf
from pydub import AudioSegment
from requests.exceptions import Timeout
|
|
|
|
|
class Scraper:
    """Download sound effects from soundeffect-lab.info and convert them to WAV.

    The scraper reads the category links from the site's top page, downloads
    every file linked from each category page (following the links in each
    page's ``pagemenu-top`` menu as well), records one row per downloaded file
    in ``self.df`` and finally converts the downloads to WAV.

    The configuration file must provide ``path_data`` (download directory),
    ``path_csv`` (where the metadata table is written) and ``sample_rate``
    (target sample rate passed to librosa).
    """

    def __init__(self, config):
        """Load the configuration, collect category links and create the data directory.

        Args:
            config: Path of a YAML file that ``OmegaConf.load`` can read.

        Raises:
            ValueError: If the request for the top page times out.
        """
        self.base_url = "https://soundeffect-lab.info/"
        self.df = pd.DataFrame([], columns=["filename", "title", "category", "url"])
        self.idx = 0  # next row label to use in ``self.df``
        self.config = OmegaConf.load(config)
        self.setup()
        os.makedirs(self.config.path_data, exist_ok=True)
        self.history = []  # categories whose page has already been processed

    def run(self):
        """Download everything, then convert the downloaded files."""
        self.all_get()
        self.preprocess()

    def setup(self):
        """Collect the category links found on the top page.

        Fills ``self.urls`` with every ``href`` that contains ``/sound/`` and
        ``self.categories`` with the text of the corresponding anchors.

        Raises:
            ValueError: If the request for the top page times out.
        """
        try:
            html = requests.get(self.base_url, timeout=5)
        except Timeout as err:
            raise ValueError("Time Out") from err
        soup = BeautifulSoup(html.content, "html.parser")
        self.urls = []
        self.categories = []
        for tag in soup.select("a"):
            url = tag.get("href")
            # Anchors without an href yield None; skip them instead of
            # failing on the substring test.
            if url and "/sound/" in url:
                self.urls.append(url)
                self.categories.append(tag.text)

    def all_get(self):
        """Download every collected category and write the metadata CSV."""
        for url, category in zip(self.urls, self.categories):
            # urljoin resolves root-relative and absolute hrefs alike.
            self.download(urllib.parse.urljoin(self.base_url, url), category)
        self.df.to_csv(self.config.path_csv)

    def download(self, now_url, category):
        """Download all files linked from one category page, then follow its menu.

        Files that already exist in the data directory are skipped. Each new
        download adds a row to ``self.df``. A failed download is reported and
        skipped so that one bad link does not stop the run.

        Args:
            now_url: URL of the category page.
            category: Category name stored with every file from this page.
        """
        # Only the page request can raise Timeout, so keep the try minimal.
        try:
            html = requests.get(now_url, timeout=5)
        except Timeout:
            print(f"Time Out: {now_url}")
            return

        soup = BeautifulSoup(html.content, "html.parser")
        wrap = soup.find(id="wrap")
        body = wrap.find("main") if wrap is not None else None
        playarea = body.find(id="playarea") if body is not None else None
        if playarea is None:
            # Not a sound listing page; nothing to download here.
            print(f"Unexpected page layout: {now_url}")
            return

        tags = playarea.select("a")
        count = 0
        for tag in tags:
            name = tag.get("download")
            url = tag.get("href")
            if not name or not url:
                continue
            # The name comes from the remote page: keep only its final
            # component so it cannot point outside the data directory.
            name = os.path.basename(name)
            if not name:
                continue
            filename = os.path.join(self.config.path_data, name)
            if os.path.exists(filename):
                continue
            try:
                urllib.request.urlretrieve(
                    urllib.parse.urljoin(now_url, url), filename
                )
            except Exception as err:
                # Best effort: report, drop any partial file so it is neither
                # mistaken for a finished download nor counted below, move on.
                print(f"Download failed: {url} ({err})")
                if os.path.exists(filename):
                    os.remove(filename)
                continue
            title = name.replace(".mp3", "")
            self.df.loc[self.idx] = {
                "filename": filename,
                "title": title,
                "category": category,
                "url": f"https://soundeffect-lab.info/sound/search.php?s={title}",
            }
            self.idx += 1
            time.sleep(2)  # pause between downloads
            count += 1

        self.history.append(category)
        print(now_url, category, len(tags), count)
        paths = glob.glob(os.path.join(self.config.path_data, "*"))
        # Sanity check: the data directory must contain exactly the files
        # recorded in the table (it fails if the directory was not empty
        # when the run started).
        assert len(paths) == len(self.df), (
            f"{len(paths)} files in {self.config.path_data} "
            f"but {len(self.df)} rows recorded"
        )

        menu = body.find(id="pagemenu-top")
        if menu is None:
            return
        for other in menu.select("a"):
            other_url = other.get("href")
            image = other.find("img")
            if not other_url or image is None:
                continue
            other_name = image.get("alt")
            # Checked right before recursing, so a category handled by a
            # deeper call is not fetched a second time from this level.
            if other_name in self.history:
                continue
            self.download(urllib.parse.urljoin(self.base_url, other_url), other_name)

    def preprocess(self):
        """Convert each downloaded file to WAV and write a resampled copy.

        For every row of ``self.df`` the file is decoded with pydub and
        exported as ``<name>.wav`` next to it; that WAV is then loaded as mono
        at ``config.sample_rate`` and written as ``new_<name>.wav`` with
        16-bit PCM samples.
        """
        for source in self.df["filename"]:
            # The stored filename already includes ``path_data``; joining it
            # again would break relative data directories.
            wav_path = os.path.splitext(source)[0] + ".wav"
            AudioSegment.from_mp3(source).export(wav_path, format="wav")

            y, sr = librosa.load(wav_path, sr=self.config.sample_rate, mono=True)
            directory, name = os.path.split(wav_path)
            sf.write(os.path.join(directory, "new_" + name), y, sr, subtype="PCM_16")
|
|
|
|
|
def argparser():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace: The parsed arguments. ``config`` holds the value
        given with ``-c``/``--config`` and defaults to ``"config.yaml"``.
    """
    cli = argparse.ArgumentParser()
    config_option = {
        "type": str,
        "default": "config.yaml",
        "help": "File path for config file.",
    }
    cli.add_argument("-c", "--config", **config_option)
    return cli.parse_args()
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build a scraper from the config file named on the
    # command line and run the download and conversion steps.
    Scraper(argparser().config).run()
|
|