File size: 3,459 Bytes
54b5359
 
 
 
 
 
 
ba0be00
ea492b0
a075e08
54b5359
 
d24432a
8547a33
5245643
636f054
ea492b0
636f054
d24432a
54b5359
36ab544
 
 
 
 
 
 
636f054
54b5359
 
67b36cf
36ab544
 
 
 
 
54b5359
 
 
36ab544
888a131
36ab544
636f054
888a131
 
 
36ab544
 
a075e08
d12d016
a075e08
 
 
54b5359
 
636f054
 
36ab544
54b5359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from tqdm.auto import tqdm
import streamlit as st
from huggingface_hub import Repository, HfApi, HfFolder
import os
import random
# Register tqdm's progress_apply helpers on pandas objects.
tqdm.pandas()

api = HfApi()

# Credentials are read from environment variables (populated from the
# Hugging Face secrets); either value is None when the variable is unset.
token = os.environ.get("token")
tokenread = os.environ.get("tokenread")

# Directory that holds the local clone of the Space repository.
localdir = "SpotifyHitPrediction"

repo = Repository(
    local_dir=localdir,
    clone_from="https://huggingface.co/spaces/Add1E/SpotifyHitPrediction",
    token=token,
)


def remove_non_utf8_characters(text):
    """Return *text* without the characters that cannot be encoded as UTF-8.

    Characters such as lone surrogates have no UTF-8 representation; they are
    silently dropped. Every other character is returned unchanged.

    Args:
        text: The string to sanitise.

    Returns:
        A string that can be encoded as UTF-8 without errors.
    """
    # 'ignore' is the codec error handler that skips unencodable characters.
    # The handler name used previously ('remove') is not registered, so the
    # first unencodable character raised LookupError instead of being dropped.
    encoded_string = text.encode('utf-8', 'ignore')
    return encoded_string.decode('utf-8')


def predict_popularity(features, trainset):
    """Predict popularity for one track with both module-level models.

    Besides predicting, this appends ``trainset`` to the stored CSV through
    ``addToCsvAndTrain`` and writes the metrics of both models to the
    Streamlit page. The metrics shown are the module-level values
    (``mse``, ``r2``, ``rf_mse``, ``rf_r2``); they are not recomputed here.

    Args:
        features: Feature values of a single sample, passed to ``predict``
            wrapped in a one-element list.
        trainset: Row forwarded to ``addToCsvAndTrain`` for storage.

    Returns:
        A two-element list: the random forest prediction first, the linear
        regression prediction second.
    """
    forest_guess = rf_model.predict([features])
    linear_guess = model.predict([features])

    # Persist the submitted row next to the rows already on disk.
    stored_rows = pd.read_csv(f'{localdir}/top50.csv', encoding='utf-8')
    addToCsvAndTrain(trainset, stored_rows)

    # One heading plus one metrics line per model, regression first.
    report = (
        ("Regression :", mse, r2),
        ("Random Forest :", rf_mse, rf_r2),
    )
    for heading, squared_error, r_squared in report:
        st.write(heading)
        st.code(f"MeanSquaredError: {squared_error}, rSqared: {r_squared}")

    return [forest_guess, linear_guess]


def addToCsvAndTrain(trainset, old_df):
    """Append one row to the stored CSV and occasionally push it upstream.

    The first 14 entries of ``trainset`` form the new row. Entries 1 and 2
    are passed through ``remove_non_utf8_characters`` (presumably the track
    and artist names -- confirm against the CSV column order). The row is
    appended to ``old_df`` and the result overwrites
    ``{localdir}/top50.csv``.

    Roughly one call in ten (when a random draw from 1..10 equals 7) also
    sets ``st.session_state['reset']`` and commits and pushes the CSV
    through ``repo``. Despite the name, no model is trained here.

    Args:
        trainset: Indexable holding at least 14 values, ordered like the
            columns of the module-level ``data`` frame.
        old_df: DataFrame with the rows stored so far.
    """
    sanitised_positions = (1, 2)
    new_row = [
        remove_non_utf8_characters(trainset[pos])
        if pos in sanitised_positions
        else trainset[pos]
        for pos in range(14)
    ]

    appended = pd.DataFrame([new_row], columns=data.columns)
    combined = pd.concat([old_df, appended], ignore_index=True)
    combined.to_csv(f'{localdir}/top50.csv', index=False, encoding='utf-8')

    # Only a random ~10% of calls go on to publish the file.
    if random.randint(1, 10) != 7:
        return

    st.session_state['reset'] = 1
    repo.git_add(os.path.abspath(f'{localdir}/top50.csv'))
    repo.git_commit("Add top50.csv")
    repo.git_push()




# Load the dataset, then print a preview and summary statistics as a quick
# sanity check of its contents and distributions.
data = pd.read_csv('top50.csv', encoding='utf-8')
print(data.head())
print(data.describe())

# Features are every column except the listed ones; the target is the
# 'Popularity' column.
X = data.drop(
    columns=['Unnamed: 0', 'Track.Name', 'Artist.Name', 'Genre', 'Popularity'],
)
y = data['Popularity']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Linear regression model, fitted on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out split: mean squared error and R^2.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
# Random forest with 100 trees and a fixed seed, fitted on the same training
# split as the linear model.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out split: mean squared error and R^2.
rf_pred = rf_model.predict(X_test)
rf_mse, rf_r2 = mean_squared_error(y_test, rf_pred), r2_score(y_test, rf_pred)


# Importance of each feature as reported by the fitted random forest.
feature_importances = rf_model.feature_importances_

# Label every importance with the name of the feature column it belongs to.
importances = pd.Series(data=feature_importances, index=X.columns)

# Most important feature first.
sorted_importances = importances.sort_values(ascending=False)