Spaces:

nomnomnonono
/

Sound-Effect-Search

Running

App Files Files Community

Sound-Effect-Search / src /scrape.py

nomnomnonono

initial

f41efe1 over 1 year ago

raw

history blame

4.63 kB

	import argparse
	import glob
	import os
	import time
	import urllib
	import urllib.parse
	import urllib.request

	import librosa
	import pandas as pd
	import requests
	import soundfile as sf
	from bs4 import BeautifulSoup
	from omegaconf import OmegaConf
	from pydub import AudioSegment
	from requests.exceptions import Timeout


	class Scraper:
	    """Scrape sound effects from soundeffect-lab.info and convert them to WAV.

	    The config file is loaded with OmegaConf; this class reads the keys
	    ``path_data`` (download directory), ``path_csv`` (metadata CSV path) and
	    ``sample_rate`` (target rate for resampling) from it.
	    """

	    def __init__(self, config):
	        """Load the config, discover category pages and create the data directory.

	        Args:
	            config: Path of the config file handed to ``OmegaConf.load``.

	        Raises:
	            ValueError: If the request for the site's top page times out.
	        """
	        self.base_url = "https://soundeffect-lab.info/"
	        # One row per downloaded file.
	        self.df = pd.DataFrame([], columns=["filename", "title", "category", "url"])
	        # Index label used for the next row appended to ``self.df``.
	        self.idx = 0
	        self.config = OmegaConf.load(config)
	        self.setup()
	        os.makedirs(self.config.path_data, exist_ok=True)
	        # Category names whose page has already been processed.
	        self.history = []

	    def run(self):
	        """Download every sound effect, then convert the downloads to WAV."""
	        self.all_get()
	        self.preprocess()

	    def setup(self):
	        """Collect category links (hrefs containing ``/sound/``) from the top page.

	        Fills ``self.urls`` with the hrefs and ``self.categories`` with the
	        matching anchor texts, in page order.

	        Raises:
	            ValueError: If the request times out.
	        """
	        try:
	            html = requests.get(self.base_url, timeout=5)
	        except Timeout as err:
	            raise ValueError("Time Out") from err
	        soup = BeautifulSoup(html.content, "html.parser")
	        self.urls = []
	        self.categories = []
	        for tag in soup.select("a"):
	            url = tag.get("href")
	            # Anchors without an href yield None; skip them instead of
	            # failing on the substring test.
	            if url and "/sound/" in url:
	                self.urls.append(url)
	                self.categories.append(tag.text)

	    def all_get(self):
	        """Download every discovered category and write the metadata CSV."""
	        for url, category in zip(self.urls, self.categories):
	            # urljoin resolves root-relative, relative and absolute hrefs alike.
	            self.download(urllib.parse.urljoin(self.base_url, url), category)
	        self.df.to_csv(self.config.path_csv)

	    def download(self, now_url, category):
	        """Download the files linked on one page, then follow its related pages.

	        Files that already exist in ``path_data`` are skipped. A failed
	        download is reported and skipped (best effort). A timeout while
	        fetching the page itself is reported and the page is skipped.

	        Args:
	            now_url: Absolute URL of the page to process.
	            category: Category name recorded for files found on this page.

	        Raises:
	            AssertionError: If the number of entries in ``path_data`` differs
	                from the number of rows recorded in ``self.df``.
	        """
	        try:
	            html = requests.get(now_url, timeout=5)
	        except Timeout:
	            print(f"Time Out: {now_url}")
	            return

	        soup = BeautifulSoup(html.content, "html.parser")
	        body = soup.find(id="wrap").find("main")
	        tags = body.find(id="playarea").select("a")
	        count = 0
	        for tag in tags:
	            name = tag.get("download")
	            url = tag.get("href")
	            if not name or not url:
	                # Not a download link; nothing to fetch or name the file after.
	                continue
	            filename = os.path.join(self.config.path_data, name)
	            if os.path.exists(filename):
	                continue
	            file_url = urllib.parse.urljoin(now_url, url)
	            try:
	                urllib.request.urlretrieve(file_url, filename)
	            except Exception as err:
	                print(f"Download failed: {file_url} ({err})")
	                # Drop a partially written file so it is neither mistaken for
	                # a finished download nor counted by the check below.
	                try:
	                    os.remove(filename)
	                except OSError:
	                    pass
	                continue
	            title = name.replace(".mp3", "")
	            self.df.loc[self.idx] = {
	                "filename": filename,
	                "title": title,
	                "category": category,
	                "url": f"https://soundeffect-lab.info/sound/search.php?s={title}",
	            }
	            self.idx += 1
	            count += 1
	            # Pause between downloads.
	            time.sleep(2)

	        self.history.append(category)
	        print(now_url, category, len(tags), count)
	        paths = glob.glob(os.path.join(self.config.path_data, "*"))
	        if len(paths) != len(self.df):
	            # Raised explicitly so the check also runs under ``python -O``.
	            raise AssertionError(
	                f"{len(paths)} entries in {self.config.path_data} "
	                f"but {len(self.df)} rows recorded"
	            )

	        related = []
	        for other in body.find(id="pagemenu-top").select("a"):
	            other_url = other.get("href")
	            image = other.find("img")
	            if not other_url or image is None:
	                continue
	            related.append((other_url, image.get("alt")))
	        for other_url, other_name in related:
	            # Checked at recursion time: a page reached through an earlier
	            # recursive call is already in the history and is not refetched.
	            if other_name in self.history:
	                continue
	            self.download(urllib.parse.urljoin(now_url, other_url), other_name)

	    @staticmethod
	    def _wav_path(mp3_path):
	        """Return *mp3_path* with its file extension replaced by ``.wav``."""
	        root, _ = os.path.splitext(mp3_path)
	        return root + ".wav"

	    def preprocess(self):
	        """Convert each downloaded MP3 to WAV and write a resampled mono copy.

	        For every row of ``self.df`` the MP3 is exported as ``<name>.wav`` next
	        to it, then reloaded at ``config.sample_rate`` as mono and written as
	        ``new_<name>.wav`` with 16-bit PCM samples.
	        """
	        # ``filename`` already includes ``path_data`` (see ``download``), so it
	        # is used as-is; joining ``path_data`` again breaks relative paths.
	        for mp3_path in self.df["filename"]:
	            wav_path = self._wav_path(mp3_path)
	            AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
	            y, sr = librosa.core.load(wav_path, sr=self.config.sample_rate, mono=True)
	            directory, name = os.path.split(wav_path)
	            sf.write(os.path.join(directory, "new_" + name), y, sr, subtype="PCM_16")


	def argparser(argv=None):
	    """Parse the scraper's command-line options.

	    Args:
	        argv: Optional list of argument strings to parse. ``None`` (the
	            default) makes argparse read the process's command line, which
	            is what the original zero-argument call did.

	    Returns:
	        argparse.Namespace whose ``config`` attribute holds the config file
	        path (``"config.yaml"`` when ``-c``/``--config`` is not given).
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "-c",
	        "--config",
	        type=str,
	        default="config.yaml",
	        help="File path for config file.",
	    )
	    return parser.parse_args(argv)


	if __name__ == "__main__":
	    # Script entry point: read the config path from the command line,
	    # build the scraper from it and run the full pipeline.
	    config_path = argparser().config
	    Scraper(config_path).run()