Spaces:
Runtime error
Runtime error
import argparse
import json
import os
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
def scrape_song_library(page_count: int = 2054, timeout: float = 30.0) -> pd.DataFrame:
    """Scrape the music4dance.net song index into one DataFrame.

    Pages 1..page_count of the song index are fetched one at a time, parsed
    with BeautifulSoup, and handed to ``get_songs``; the per-song dicts from
    every page are combined into a single frame.

    Args:
        page_count: Number of index pages to fetch, starting at page 1.
            A value of 0 (or less) fetches nothing and yields an empty frame.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A DataFrame with one row per song and the columns listed in
        ``columns`` below; fields a song does not provide are left missing.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        requests.Timeout: If a page does not respond within ``timeout``.
    """
    columns = [
        "Title",
        "Artist",
        "Length",
        "Tempo",
        "Beat",
        "Energy",
        "Danceability",
        "Valence",
        "Sample",
        "Tags",
        "DanceRating",
    ]
    # Collect plain dicts and build the frame once at the end: concatenating
    # inside the loop recopies every previously scraped row on each page.
    rows = []
    for page_number in tqdm(range(1, page_count + 1), desc="Pages processed"):
        link = "https://www.music4dance.net/song/Index?filter=v2-Index&page=" + str(page_number)
        page = requests.get(link, timeout=timeout)
        # Fail here with a clear HTTP error rather than later while parsing
        # an error page that lacks the expected script payload.
        page.raise_for_status()
        soup = bs(page.content, "html.parser")
        rows.extend(get_songs(soup))
    return pd.DataFrame(rows, columns=columns)
def get_songs(soup: "bs") -> list:
    """Extract per-song records from the JSON embedded in a song index page.

    The first ``<script>`` element whose text contains ``"histories"`` is
    located, the outermost ``{...}`` span inside it is parsed as JSON, and
    each entry of its ``"histories"`` list is folded into one dict by
    replaying that entry's ``"properties"`` in order.

    Args:
        soup: Parsed page; only ``soup.find_all("script")`` is used.

    Returns:
        A list with one dict per song. Every dict has ``"Tags"`` (a set) and
        ``"DanceRating"`` (a dict mapping dance name to a count), plus
        whichever of the ``reset_keys`` fields the song's properties provide.

    Raises:
        IndexError: If no script element contains ``"histories"``.
    """
    # DOTALL lets "." span newlines; this matches the same text as the
    # alternation form "(.|\n)*" without per-character group bookkeeping.
    js_obj = re.compile(r"\{.*\}", re.DOTALL)
    reset_keys = (
        "Title",
        "Artist",
        "Length",
        "Tempo",
        "Beat",
        "Energy",
        "Danceability",
        "Valence",
        "Sample",
    )
    # Stringify each script tag once, then pick the first one holding the data.
    script_texts = [str(tag) for tag in soup.find_all("script")]
    song_text = [text for text in script_texts if "histories" in text][0]
    songs_data = json.loads(js_obj.search(song_text).group(0))

    songs = []
    for song_data in songs_data["histories"]:
        song = {"Tags": set(), "DanceRating": {}}
        for feature in song_data["properties"]:
            # Properties lacking either field carry nothing to record.
            if "name" not in feature or "value" not in feature:
                continue
            key = feature["name"]
            value = feature["value"]
            if key in reset_keys:
                # A later property with the same name overwrites the earlier one.
                song[key] = value
            elif key == "Tag+":
                song["Tags"].add(value)
            elif key == "DeleteTag":
                # Deleting a tag that was never added is a no-op.
                song["Tags"].discard(value)
            elif key == "DanceRating":
                # Strip the "+1" suffix and count one vote for that dance.
                # NOTE(review): presumably values look like "<dance>+1" - confirm.
                dance = value.replace("+1", "")
                song["DanceRating"][dance] = song["DanceRating"].get(dance, 0) + 1
        songs.append(song)
    return songs
def scrape_dance_info(timeout: float = 30.0) -> pd.DataFrame:
    """Scrape the dance catalogue embedded in the song index page.

    The index page is fetched, the first ``<script>`` element whose text
    contains ``"environment"`` is located, the outermost ``{...}`` span in it
    is parsed as JSON, and its ``"dances"`` list is reduced to a few fields.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A DataFrame with one row per dance, holding whichever of ``name``,
        ``id``, ``synonyms``, ``tempoRange`` and ``songCount`` each dance has.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.Timeout: If the page does not respond within ``timeout``.
        IndexError: If no script element contains ``"environment"``.
    """
    # DOTALL lets "." span newlines; this matches the same text as the
    # alternation form "(.|\n)*" without per-character group bookkeeping.
    js_obj = re.compile(r"\{.*\}", re.DOTALL)
    link = "https://www.music4dance.net/song/Index?filter=v2-Index"
    page = requests.get(link, timeout=timeout)
    # Surface HTTP failures directly instead of failing later during parsing.
    page.raise_for_status()
    soup = bs(page.content, "html.parser")

    script_texts = [str(tag) for tag in soup.find_all("script")]
    dance_info_text = [text for text in script_texts if "environment" in text][0]
    dance_info = json.loads(js_obj.search(dance_info_text).group(0))["dances"]

    wanted_keys = ("name", "id", "synonyms", "tempoRange", "songCount")
    records = [
        {key: value for key, value in dance.items() if key in wanted_keys}
        for dance in dance_info
    ]
    return pd.DataFrame(records)
if __name__ == "__main__":
    # Command-line entry point: scrape the requested number of index pages
    # and write the resulting table to a CSV file.
    parser = argparse.ArgumentParser()
    parser.add_argument("--page-count", default=2, type=int)
    parser.add_argument("--out", default="data/song.csv")
    args = parser.parse_args()

    out_path = Path(args.out)
    # Path.parent is "." for a bare filename, so a file in the current
    # directory is not misreported as having a missing output location.
    out_dir = out_path.parent
    if not out_dir.is_dir():
        # Stop before scraping: otherwise the whole crawl runs and the
        # write at the end fails anyway.
        raise SystemExit(f"Output location does not exist: {out_dir}")

    df = scrape_song_library(args.page_count)
    df.to_csv(out_path)