import os
import json
import math
import random
import datetime

import numpy as np
import pandas as pd
import torch
import spotipy
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from spotlight.interactions import Interactions
from spotlight.factorization.implicit import ImplicitFactorizationModel

data_path = '/Users/haoxinli/Downloads/mpd.v1/data/'
playlist_folder = os.listdir(data_path)
## Preparing data
# Split each playlist's (pid, track_uri) pairs roughly 80/20 into train and validation.
CF_baseline_Train = []
CF_baseline_Val = []
counter = 0
for i in range(100):
    with open(data_path + playlist_folder[i]) as f:
        raw_data = json.load(f)
    playlists = raw_data['playlists']
    for playlist in playlists:
        pid = playlist['pid']
        for track in playlist['tracks']:
            track_uri = track['track_uri']
            rand = random.random()
            if rand < 0.8:
                CF_baseline_Train.append([pid, track_uri])
            else:
                CF_baseline_Val.append([pid, track_uri])
        counter += 1
with open("CF_baseline_Train_80000.json", "w") as f:
    json.dump(CF_baseline_Train, f)
with open("CF_baseline_Val_20000.json", "w") as f:
    json.dump(CF_baseline_Val, f)
CF_baseline_Train:
a list of lists for training, with each inner list being a [playlist id, track uri] pair.
CF_baseline_Val:
a list of lists for validation, with each inner list being a [playlist id, track uri] pair.
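For illustration, each entry is a single playlist-track interaction; a peek at the training list looks roughly like this (the uris shown are placeholders, not real values):
# Illustrative shape only; the uris below are placeholders.
CF_baseline_Train[:2]
# [[0, 'spotify:track:<uri_a>'],
#  [0, 'spotify:track:<uri_b>']]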
# Group the validation pairs back into per-playlist lists of track uris.
with open('CF_baseline_Val_20000.json', 'r') as f:
    data = json.load(f)
validation = []
playlist_num = data[0][0]
songs = []
for item in data:
    playlist_num_new = item[0]
    if playlist_num_new == playlist_num:
        songs.append(item[1])
        playlist_num = playlist_num_new
    else:
        validation.append(songs)
        songs = [item[1]]
        playlist_num = playlist_num_new
validation.append(songs)  # flush the final playlist
# prepare validation x and y
X_val = []
y_val = []
for i in range(len(validation)):
    playlist = validation[i]
    x, y = train_test_split(playlist, train_size=0.7)
    X_val.append(x)
    y_val.append(y)
with open("/Users/haoxinli/DocumentsLocal/Spotify/X_validation.json", "w") as f:
    json.dump(X_val, f)
with open("/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json", "w") as f:
    json.dump(y_val, f)
X_val:
the validation inputs: a list of lists, each containing only the track uris retained (~70%) from one validation playlist.
y_val:
the validation ground truth: a list of lists, each containing only the track uris held out (~30%) from the same playlist.
## Load data
with open('CF_baseline_Train_40000.json', 'r') as f:
    data = json.load(f)
users = [user for user, song in data]
item = [song for user, song in data]
## Assign an id to each playlist
song_per_user = {}
for user in users:
    if user not in song_per_user:
        song_per_user[user] = 1
    else:
        song_per_user[user] += 1
# print(len(song_per_user))
song_per_user
: a dictionary of 40,000 entries with playlist id as the key, and the number of songs as the value
pids = {}
num = 0
for user in song_per_user.keys():
    pids[user] = num
    num += 1
pids
: a dictionary of 40,000 entries with playlist id as the key, and transformed playlist id as the value
users_transform = []
for user in users:
    users_transform.append(pids[user])
users_transform
: a list of transformed playlist ids, of length 2,657,804, which is the number of user-song interactions in the training data.
## Assign an id to each track uri
# count the appearances of each track
count = {}
for i in item:
    if i not in count:
        count[i] = 1
    else:
        count[i] = count[i] + 1
# print(len(count))
count
: a dictionary of 401,492 entries with track uri as the key and the track's number of appearances as the value.
item_id = {}
id = 0
for i in count.keys():
    item_id[i] = id
    id = id + 1
# print(len(item_id))
item_id
: a dictionary of 401,492 entries with track uri as the key and the transformed track id (0, 1, 2, ...) as the value.
item_transformed = []
for i in item:
    item_transformed.append(item_id[i])
# print(len(item_transformed))
item_transformed
: a list of transformed track ids, aligned element-wise with users_transform, with a length of 2,657,804.
ratings = np.ones(len(item_transformed))
ratings
: an array of 1s marking each observed user-song interaction (implicit feedback), with a length of 2,657,804.
# prepare the interaction matrix
data = Interactions(np.array(users_transform), np.array(item_transformed), ratings)
data
: the prepared arrays wrapped in Spotlight's Interactions object, the format its models consume.
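A quick sanity check before training can confirm the wrapper picked up the right dimensions; num_users, num_items and ratings are attributes of Spotlight's Interactions class, and the expected counts below come from the descriptions above:
# Dimensions inferred by Spotlight from the id arrays.
print(data.num_users, data.num_items)   # expected: 40000 users, 401492 items
print(len(data.ratings))                # expected: 2657804 interactions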
model = ImplicitFactorizationModel(n_iter = 5)
model.fit(data, verbose = 1)
torch.save(model, 'baseline_model_40000')
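The CF_recommended_song.json file loaded below is not produced anywhere in this notebook; presumably it holds, for each of the first 1,000 validation playlists, a dict mapping candidate track uris to their collaborative-filtering scores. A minimal sketch of how it could be generated with the trained Spotlight model follows; the names data_val, val_pids, id_to_uri and the top-500 cutoff are assumptions, not the original pipeline.
# Sketch (assumed): score every known track for each validation playlist with
# model.predict, keep the 500 highest-scoring tracks, and map ids back to uris.
with open('CF_baseline_Val_20000.json', 'r') as f:
    data_val = json.load(f)
id_to_uri = {v: k for k, v in item_id.items()}
# Distinct pids in the order they appear in the validation file (matches `validation`).
val_pids = list(dict.fromkeys(pid for pid, uri in data_val))

CF_recommended_song = []
for pid in val_pids[:1000]:
    if pid not in pids:                      # playlist unseen during training
        CF_recommended_song.append({})
        continue
    scores = model.predict(pids[pid])        # one score per known track
    top = np.argsort(-scores)[:500]
    CF_recommended_song.append({id_to_uri[j]: float(scores[j]) for j in top})

with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'w') as f:
    json.dump(CF_recommended_song, f)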
with open('/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json', 'r') as f:
    y_validation = json.load(f)
with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'r') as f:
    recs = json.load(f)
# Keep the first 100 recommended uris per playlist.
recommendation = [list(single_score.keys())[:100] for single_score in recs]
R_precision_scores = []
NDCG_scores = []
for i in range(1000):
    rec = recommendation[i]
    Y = y_validation[i]
    R_precision_scores.append(R_precision(rec, Y))
    NDCG_scores.append(NDCG(rec, Y))
def R_precision(recommendation, Y):
    # Fraction of the ground-truth tracks that appear anywhere in the recommendation list.
    return len(set(recommendation) & set(Y)) / len(Y)

def NDCG(recommendation, Y):
    # Binary relevance: a recommended track is relevant if it is in the ground truth.
    DCG = 0
    for i in range(len(recommendation)):
        if recommendation[i] in Y and i == 0:
            DCG += 1
        elif i > 0 and recommendation[i] in Y:
            DCG += 1 / math.log(i + 2, 2)
    # Ideal DCG: all len(Y) relevant tracks ranked at the top of the list.
    IDCG = 0
    for i in range(len(Y)):
        if i == 0:
            IDCG += 1
        else:
            IDCG += 1 / math.log(i + 2, 2)
    return DCG / IDCG
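As a quick hand-worked check of both metrics (toy lists, not real uris): two of the four ground-truth tracks are recovered, at ranks 1 and 3.
rec_toy = ['a', 'x', 'b', 'y']
Y_toy = ['a', 'b', 'c', 'd']
R_precision(rec_toy, Y_toy)  # 2/4 = 0.5
NDCG(rec_toy, Y_toy)         # (1 + 1/log2(4)) / (1 + 1/log2(3) + 1/log2(4) + 1/log2(5)) ≈ 0.59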
np.mean(R_precision_scores)
np.mean(NDCG_scores)
song_df = pd.read_csv('test0-99.csv')
uri_full = song_df.iloc[:, 3].values
audio_dict_full = {uri: {} for uri in uri_full}
# token: Spotify client credentials manager (defined elsewhere).
sp = spotipy.Spotify(client_credentials_manager=token, requests_timeout=10)
start_time = datetime.datetime.now()
batch = 50
# Query the Spotify audio-features endpoint in batches of 50 uris (ceil keeps the final partial batch).
for k in tqdm(range(math.ceil(len(uri_full) / batch))):
    uris = [uri_full[i] for i in range(batch * k, min(batch * (k + 1), len(uri_full)))]
    extracted = sp.audio_features(uris)
    for i in range(len(uris)):
        try:
            for feature in ['danceability', 'energy', 'key', 'loudness', 'mode',
                            'speechiness', 'acousticness', 'instrumentalness',
                            'liveness', 'valence', 'tempo']:
                audio_dict_full[uri_full[batch * k + i]][feature] = extracted[i][feature]
        except TypeError:
            # audio_features() returns None for tracks it cannot resolve.
            pass
end_time = datetime.datetime.now()
print(end_time - start_time)
audio_features = pd.DataFrame.from_dict(audio_dict_full, orient='index')
audio_features_file = 'audio_features.csv'
with open(audio_features_file, mode='w') as f:
    audio_features.to_csv(f)
audio_features
: a dataframe with one row per track uri and one column per extracted audio feature.
audio_features_norm = pd.read_csv('/Users/haoxinli/DocumentsLocal/Spotify/MPD_audio_0_99_norm.csv')
audio_features_norm = audio_features_norm.iloc[:,1:]
audio_features_values = audio_features_norm.iloc[:,:-1].values
trackid = audio_features_norm['track_id']
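The normalized feature file MPD_audio_0_99_norm.csv is loaded above but its creation is not shown. A plausible sketch of that step, assuming min-max scaling of the audio_features.csv written earlier and a final 'track_id' column holding the track identifier (the scaler choice and column layout are assumptions):
# Sketch (assumed): scale each audio feature to [0, 1] and append the track
# identifier as a final 'track_id' column, matching how the file is read back above.
from sklearn.preprocessing import MinMaxScaler

raw = pd.read_csv('audio_features.csv', index_col=0)   # index = track identifier
norm = pd.DataFrame(MinMaxScaler().fit_transform(raw.values), columns=raw.columns)
norm['track_id'] = raw.index
norm.to_csv('MPD_audio_0_99_norm.csv')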
def hybrid(recommendation, validation):
    # Re-rank each playlist's top-100 CF candidates by their summed cosine similarity
    # (over audio features) to the tracks already known to be in the playlist.
    hybrid_rec = []
    for i in tqdm(range(len(recommendation))):
        dictionary = {}
        rec_playlist = recommendation[i].copy()
        test_val = validation[i].copy()
        for song in rec_playlist[:100]:
            score = 0
            song_pos = np.where(trackid == 'spotify:track:' + song)[0]
            for truth_song in test_val:
                truth_song_pos = np.where(trackid == 'spotify:track:' + truth_song)[0]
                try:
                    score = score + cosine_similarity(audio_features_values[truth_song_pos],
                                                      audio_features_values[song_pos])[0][0]
                except:
                    # Skip pairs whose audio features are missing from the table.
                    pass
            dictionary[song] = score
        dictionary = {k: v for k, v in sorted(dictionary.items(), key=lambda item: item[1], reverse=True)}
        hybrid_rec.append(dictionary)
    return hybrid_rec
with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'r') as f:
    recs = json.load(f)
# hybrid() slices its candidates, so pass the ranked track uris rather than the score dicts.
recommendation = [list(single_score.keys()) for single_score in recs]
with open("/Users/haoxinli/DocumentsLocal/Spotify/X_validation.json", "r") as f:
    validation = json.load(f)
hybrid_rec = hybrid(recommendation, validation[:1000])
with open('/Users/haoxinli/DocumentsLocal/Spotify/hybrid_rec.json', 'w') as f:
    json.dump(hybrid_rec, f)
hybrid_rec
: a list of dicts, one per playlist, mapping each CF-recommended track uri to its summed audio similarity with the songs already in the playlist, sorted from most to least similar.
with open('/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json', 'r') as f:
    y_validation = json.load(f)
with open('/Users/haoxinli/DocumentsLocal/Spotify/hybrid_rec.json', 'r') as f:
    hybrid_rec = json.load(f)
recommendation = [list(single_score.keys())[:100] for single_score in hybrid_rec]
R_precision_scores = []
NDCG_scores = []
for i in range(1000):
    rec = recommendation[i]
    Y = y_validation[i]
    R_precision_scores.append(R_precision(rec, Y))
    NDCG_scores.append(NDCG(rec, Y))
np.mean(R_precision_scores)
np.mean(NDCG_scores)