# --- Hugging Face Spaces page residue (not Python); commented out so the file parses ---
# Spaces: Sleeping / Sleeping
# File size: 3,459 Bytes
# blame hashes: 54b5359 ba0be00 ea492b0 a075e08 54b5359 d24432a 8547a33 5245643 636f054 ea492b0 636f054 d24432a 54b5359 36ab544 636f054 54b5359 67b36cf 36ab544 54b5359 36ab544 888a131 36ab544 636f054 888a131 36ab544 a075e08 d12d016 a075e08 54b5359 636f054 36ab544 54b5359
# (gutter line numbers 1-104 omitted)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from tqdm.auto import tqdm
import streamlit as st
from huggingface_hub import Repository, HfApi, HfFolder
import os
import random
tqdm.pandas()
api = HfApi()
token = os.getenv("token") # Das Token wird aus den Hugging Face Secrets abgerufen
tokenread = os.getenv("tokenread")
localdir = "SpotifyHitPrediction"
repo = Repository(local_dir=localdir, clone_from="https://huggingface.co/spaces/Add1E/SpotifyHitPrediction", token=token)
def remove_non_utf8_characters(text):
# Encode using UTF-8, ignore errors
encoded_string = text.encode('utf-8', 'remove')
# Decode back to string
return encoded_string.decode('utf-8')
def predict_popularity(features,trainset):
predictions = [None] * 2
predictions[0], predictions[1] = rf_model.predict([features]), model.predict([features])
old_df = pd.read_csv(f'{localdir}/top50.csv', encoding='utf-8')
addToCsvAndTrain(trainset, old_df)
st.write("Regression :")
st.code(f"MeanSquaredError: {mse}, rSqared: {r2}")
st.write("Random Forest :")
st.code(f"MeanSquaredError: {rf_mse}, rSqared: {rf_r2}")
return predictions
def addToCsvAndTrain(trainset, old_df):
trainset = [
[trainset[0], remove_non_utf8_characters(trainset[1]), remove_non_utf8_characters(trainset[2]), trainset[3], trainset[4], trainset[5], trainset[6], trainset[7],
trainset[8], trainset[9], trainset[10], trainset[11], trainset[12], trainset[13]
]
]
neues_df = pd.DataFrame(trainset, columns= data.columns)
df = pd.concat([old_df, neues_df], ignore_index=True)
df.to_csv(f'{localdir}/top50.csv', index=False, encoding='utf-8')
if(random.randint(1, 10) == 7):
st.session_state['reset'] = 1
repo.git_add(os.path.abspath(f'{localdir}/top50.csv'))
repo.git_commit("Add top50.csv")
repo.git_push()
data = pd.read_csv('top50.csv', encoding='utf-8')
print(data.head())
# Let's also describe the data to get a sense of the distributions
print(data.describe())
# Selecting the features and the target variable
X = data.drop(['Unnamed: 0', 'Track.Name', 'Artist.Name', 'Genre', 'Popularity'], axis=1)
y = data['Popularity']
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initializing the Linear Regression model
model = LinearRegression()
# Fitting the model
model.fit(X_train, y_train)
# Making predictions
y_pred = model.predict(X_test)
# Calculating the performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# Fitting the model
rf_model.fit(X_train, y_train)
# Making predictions
rf_pred = rf_model.predict(X_test)
# Calculating the performance metrics
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)
# Feature importances
feature_importances = rf_model.feature_importances_
# Create a pandas series with feature importances
importances = pd.Series(feature_importances, index=X.columns)
# Sort the feature importances in descending order
sorted_importances = importances.sort_values(ascending=False)
# (stray scrape artifact: "|")