data_path = '/Users/haoxinli/Downloads/mpd.v1/data/'
# Sort for a deterministic slice order and keep only the MPD json slices
# (the original listdir could return e.g. '.DS_Store' and crash json.load).
playlist_folder = sorted(name for name in os.listdir(data_path) if name.endswith('.json'))

## Preparing data
# 80/20 random split of (playlist, track) interactions from the first 100 slices.
# NOTE(review): random.random() is unseeded, so the split is not reproducible.
CF_baseline_Train = []
CF_baseline_Val = []
counter = 0  # number of playlists processed
for i in range(100):
    with open(data_path + playlist_folder[i]) as f:
        raw_data = json.load(f)
    for playlist in raw_data['playlists']:
        pid = playlist['pid']
        for track in playlist['tracks']:
            track_uri = track['track_uri']
            if random.random() < 0.8:
                CF_baseline_Train.append([pid, track_uri])
            else:
                CF_baseline_Val.append([pid, track_uri])
        counter += 1

# json.dump returns None, so the original `data_json = ...` assignments were dropped.
with open("CF_baseline_Train_80000.json", "w") as f:
    json.dump(CF_baseline_Train, f)
with open("CF_baseline_Val_20000.json", "w") as f:
    json.dump(CF_baseline_Val, f)
# CF_baseline_Train: training interactions, each a [playlist id, track uri] pair.
# CF_baseline_Val: validation interactions in the same format.
CF_baseline_Val: a list of lists for validation, with each inner list being a [playlist id, track uri] pair.
with open('CF_baseline_Val_20000.json', 'r') as f:
    data = json.load(f)

# Group consecutive [pid, track uri] pairs into one track list per playlist
# (pairs for the same playlist are contiguous in the file).
validation = []
playlist_num = data[0][0]
songs = []
for pid, track_uri in data:
    if pid == playlist_num:
        songs.append(track_uri)
    else:
        validation.append(songs)
        songs = [track_uri]
        playlist_num = pid
# Bug fix: the original loop dropped the final playlist's tracks entirely.
if songs:
    validation.append(songs)

# prepare validation x and y: 70% of each playlist is the seed, 30% the ground truth
X_val = []
y_val = []
for playlist in validation:
    # train_test_split raises for playlists with fewer than 2 tracks
    if len(playlist) < 2:
        continue
    x, y = train_test_split(playlist, train_size=0.7)
    X_val.append(x)
    y_val.append(y)

# json.dump returns None, so the `data_json = ...` assignments were dropped.
with open("/Users/haoxinli/DocumentsLocal/Spotify/X_validation.json", "w") as f:
    json.dump(X_val, f)
with open("/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json", "w") as f:
    json.dump(y_val, f)
# X_val: per-playlist seed tracks (track uris only) used as validation input.
# y_val: per-playlist held-out tracks (track uris only) used as ground truth.
y_val: dataset used as the ground truth during validation, consisting of a list of lists containing only the track uris.
## Load data
with open('CF_baseline_Train_40000.json', 'r') as f:
    data = json.load(f)
# Each entry is a [playlist id, track uri] pair.
users = [user for user, song in data]
item = [song for user, song in data]

## Assign id to each playlist title
# Count how many songs each playlist contains (dict.get replaces the
# original `in dict.keys()` check-then-insert pattern).
song_per_user = {}
for user in users:
    song_per_user[user] = song_per_user.get(user, 0) + 1
# print(len(song_per_user))
# song_per_user: dict of ~40,000 entries, playlist id -> number of songs.
# Assign a dense integer id (0..n-1) to each playlist id, in first-seen order.
pids = {user: seq for seq, user in enumerate(song_per_user)}
# pids: dict of ~40,000 entries, playlist id -> transformed (dense) playlist id.
# Map every interaction's playlist id to its dense id.
users_transform = [pids[user] for user in users]
# users_transform: list of transformed playlist ids, one per user-song
# interaction in the training data (length 2,657,804).
## Assign id to each track id
# appearance of each track (dict.get replaces the original
# `in dict.keys()` check-then-insert pattern)
count = {}
for track_uri in item:
    count[track_uri] = count.get(track_uri, 0) + 1
# print(len(count))
# count: dict of ~401,492 entries, track uri -> number of appearances.
# Assign a dense integer id to each track uri, in first-seen order.
# (The original counter variable `id` shadowed the builtin; enumerate avoids it.)
item_id = {track_uri: track_num for track_num, track_uri in enumerate(count)}
# print(len(item_id))
# item_id: dict of ~401,492 entries, track uri -> transformed track id (0, 1, 2, ...).
# Map every interaction's track uri to its dense track id.
item_transformed = [item_id[track_uri] for track_uri in item]
# print(len(item_transformed))
# item_transformed: transformed track ids aligned index-for-index with
# users_transform (length 2,657,804).
# Implicit feedback: every observed user-song interaction gets rating 1.0.
ratings = np.full(len(item_transformed), 1.0)
# ratings: float array of 1s, one per interaction (length 2,657,804).
# prepare the interaction matrix: pack the (user, item, rating) triples
# into Spotlight's Interactions format.
user_array = np.array(users_transform)
item_array = np.array(item_transformed)
data = Interactions(user_array, item_array, ratings)
# data: the training interactions in Spotlight-readable form.
def R_precision(recommendation, Y):
    """Fraction of the ground-truth tracks Y that appear in `recommendation`.

    Bug fix: the original called an undefined `intersection()` helper; the
    set intersection below implements it. Raises ZeroDivisionError if Y is empty.
    """
    return len(set(recommendation) & set(Y)) / len(Y)


def NDCG(recommendation, Y):
    """Normalized discounted cumulative gain with binary relevance.

    Position i (0-based) contributes 1/log2(i + 2) when the recommended track
    is in Y; the ideal DCG assumes all len(Y) relevant tracks come first.
    Raises ZeroDivisionError if Y is empty.
    """
    relevant = set(Y)
    DCG = sum(1 / math.log(i + 2, 2)
              for i, track in enumerate(recommendation) if track in relevant)
    IDCG = sum(1 / math.log(i + 2, 2) for i in range(len(Y)))
    return DCG / IDCG


# Guarded so the metric functions above are importable without retraining;
# when run as a script this executes in file order exactly as before.
if __name__ == "__main__":
    # --- Train and persist the implicit-MF baseline ---
    # `data` is the Spotlight Interactions object built earlier in the file.
    model = ImplicitFactorizationModel(n_iter=5)
    model.fit(data, verbose=1)
    torch.save(model, 'baseline_model_40000')

    # --- Evaluate the CF recommendations on the first 1000 playlists ---
    with open('/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json', 'r') as f:
        y_validation = json.load(f)
    with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'r') as f:
        recs = json.load(f)
    recommendation = [list(single_score.keys())[:100] for single_score in recs]
    # Bug fixes: the score lists were never initialized, the metrics were
    # called before being defined, and `val_Y` was undefined.
    R_precision_scores = []
    NDCG_scores = []
    for i in range(1000):
        rec = recommendation[i]  # already truncated to the top 100 tracks
        Y = y_validation[i]
        R_precision_scores.append(R_precision(rec, Y))
        NDCG_scores.append(NDCG(rec, Y))
    print(np.mean(R_precision_scores))
    print(np.mean(NDCG_scores))  # original had a typo: NDCG_socres

    # --- Pull audio features for every track in test0-99.csv from Spotify ---
    song_df = pd.read_csv('test0-99.csv')
    # NOTE(review): assumes column 3 holds the track uris — confirm against the csv.
    uri_full = song_df.iloc[:, 3].values
    audio_dict_full = {uri: {} for uri in uri_full}
    # `token` (the client-credentials manager) is assumed defined earlier in the file.
    sp = spotipy.Spotify(client_credentials_manager=token, requests_timeout=10)
    start_time = datetime.datetime.now()
    batch = 50
    feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode',
                    'speechiness', 'acousticness', 'instrumentalness',
                    'liveness', 'valence', 'tempo']
    # Bug fix: ceil instead of floor so the final partial batch is no longer
    # dropped (the min() below already handled a short last batch).
    for k in tqdm(range(math.ceil(len(uri_full) / batch))):
        uris = [uri_full[i] for i in range(batch * k, min(batch * (k + 1), len(uri_full)))]
        extracted = sp.audio_features(uris)
        # Bug fix: iterate the actual batch instead of a hard-coded range(50),
        # and check for None explicitly (Spotify returns None for tracks with
        # no audio features) instead of catching TypeError.
        for i, uri in enumerate(uris):
            if extracted[i] is None:
                continue
            # renamed from `item`, which shadowed the module-level track list
            for feat in feature_keys:
                audio_dict_full[uri][feat] = extracted[i][feat]
    end_time = datetime.datetime.now()
    print(end_time - start_time)

    audio_features = pd.DataFrame.from_dict(audio_dict_full, orient='index')
    audio_features.to_csv('audio_features.csv')
    # audio_features: one row per track uri, one column per audio feature.
# Normalized audio features prepared offline; drop the csv's leading index column.
audio_features_norm = pd.read_csv('/Users/haoxinli/DocumentsLocal/Spotify/MPD_audio_0_99_norm.csv')
audio_features_norm = audio_features_norm.iloc[:, 1:]
audio_features_values = audio_features_norm.iloc[:, :-1].values  # feature matrix
trackid = audio_features_norm['track_id']                        # aligned track uris


def hybrid(recommendation, validation):
    """Re-rank each playlist's CF recommendations by audio similarity.

    For playlist i, each of the first 100 tracks in recommendation[i] is
    scored by the summed cosine similarity between its audio-feature vector
    and the vectors of the playlist's known tracks (validation[i]). Tracks
    missing from the feature table contribute/receive a score of 0, matching
    the original's silent skip.

    Returns a list of {track uri: score} dicts sorted by descending score.
    """
    hybrid_rec = []
    for idx in tqdm(range(len(recommendation))):
        # Hoist: resolve the known tracks' feature rows once per playlist
        # (the original re-ran these lookups for every recommended track).
        truth_rows = []
        for truth_song in validation[idx]:
            pos = np.where(trackid == 'spotify:track:' + truth_song)[0]
            if pos.size:
                truth_rows.append(audio_features_values[pos])
        scores = {}
        for song in recommendation[idx][:100]:
            pos = np.where(trackid == 'spotify:track:' + song)[0]
            score = 0
            # Explicit emptiness check replaces the original bare `except: pass`.
            if pos.size:
                song_row = audio_features_values[pos]
                for truth_row in truth_rows:
                    score += cosine_similarity(truth_row, song_row)[0][0]
            scores[song] = score
        # Highest score first; sorted() is stable, like the original.
        hybrid_rec.append(dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True)))
    return hybrid_rec


with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'r') as f:
    recommendation = json.load(f)
with open("/Users/haoxinli/DocumentsLocal/Spotify/X_validation.json", "r") as f:
    validation = json.load(f)
hybrid_rec = hybrid(recommendation, validation[:1000])
with open('/Users/haoxinli/DocumentsLocal/Spotify/hybrid_rec.json', 'w') as f:
    json.dump(hybrid_rec, f)
# hybrid_rec: per-playlist recommendations re-scored by audio similarity to
# the playlist's known tracks.
# Evaluate the hybrid recommendations on the first 1000 playlists.
with open('/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json', 'r') as f:
    y_validation = json.load(f)
with open('/Users/haoxinli/DocumentsLocal/Spotify/hybrid_rec.json', 'r') as f:
    hybrid_rec = json.load(f)
recommendation = [list(single_score.keys())[:100] for single_score in hybrid_rec]

# Fresh lists so the hybrid scores are not mixed into the baseline's lists.
R_precision_scores = []
NDCG_scores = []
for i in range(1000):
    rec = recommendation[i]
    # Bug fixes: the original scored `hybrid_rec` (the whole list of dicts)
    # instead of `rec`, and read the undefined `val_Y`.
    Y = y_validation[i]
    R_precision_scores.append(R_precision(rec, Y))
    NDCG_scores.append(NDCG(rec, Y))
print(np.mean(R_precision_scores))
print(np.mean(NDCG_scores))  # original had a typo: NDCG_socres