import os
import json
import math
import random
import datetime

import numpy as np
import pandas as pd
import torch
import spotipy
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from spotlight.interactions import Interactions
from spotlight.factorization.implicit import ImplicitFactorizationModel

data_path = '/Users/haoxinli/Downloads/mpd.v1/data/'
playlist_folder = os.listdir(data_path)
## Preparing data
# Split each playlist's (pid, track_uri) pairs roughly 80/20 into train and validation.
CF_baseline_Train = []
CF_baseline_Val = []
counter = 0
for i in range(100):
    with open(data_path + playlist_folder[i]) as f:
        raw_data = json.load(f)
    playlists = raw_data['playlists']
    for playlist in playlists:
        pid = playlist['pid']
        for track in playlist['tracks']:
            track_uri = track['track_uri']
            rand = random.random()
            if rand < 0.8:
                CF_baseline_Train.append([pid, track_uri])
            else:
                CF_baseline_Val.append([pid, track_uri])
        counter += 1
with open("CF_baseline_Train_80000.json", "w") as f:
    json.dump(CF_baseline_Train, f)
with open("CF_baseline_Val_20000.json", "w") as f:
    json.dump(CF_baseline_Val, f)
CF_baseline_Train:
a list of lists for training, with each inner list being a [playlist id, track uri] pair.
CF_baseline_Val:
a list of lists for validation, with each inner list being a [playlist id, track uri] pair.
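For illustration, each entry is a single playlist-track interaction; a peek at the training list looks roughly like this (the uris shown are placeholders, not real values):
# Illustrative shape only; the uris below are placeholders.
CF_baseline_Train[:2]
# [[0, 'spotify:track:<uri_a>'],
#  [0, 'spotify:track:<uri_b>']]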
# Group the validation pairs back into per-playlist lists of track uris.
with open('CF_baseline_Val_20000.json', 'r') as f:
    data = json.load(f)
validation = []
playlist_num = data[0][0]
songs = []
for item in data:
    playlist_num_new = item[0]
    if playlist_num_new == playlist_num:
        songs.append(item[1])
        playlist_num = playlist_num_new
    else:
        validation.append(songs)
        songs = [item[1]]
        playlist_num = playlist_num_new
validation.append(songs)  # flush the final playlist
# prepare validation x and y
X_val = []
y_val = []
for i in range(len(validation)):
    playlist = validation[i]
    x, y = train_test_split(playlist, train_size=0.7)
    X_val.append(x)
    y_val.append(y)
with open("/Users/haoxinli/DocumentsLocal/Spotify/X_validation.json", "w") as f:
    json.dump(X_val, f)
with open("/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json", "w") as f:
    json.dump(y_val, f)
X_val:
the validation inputs: a list of lists, each containing only the track uris retained (~70%) from one validation playlist.
y_val:
the validation ground truth: a list of lists, each containing only the track uris held out (~30%) from the same playlist.
## Load data
with open('CF_baseline_Train_40000.json', 'r') as f:
    data = json.load(f)
users = [user for user, song in data]
item = [song for user, song in data]
## Assign an id to each playlist
song_per_user = {}
for user in users:
    if user not in song_per_user:
        song_per_user[user] = 1
    else:
        song_per_user[user] += 1
# print(len(song_per_user))
song_per_user
: a dictionary of 40,000 entries with playlist id as the key, and the number of songs as the value
pids = {}
num = 0
for user in song_per_user.keys():
    pids[user] = num
    num += 1
pids
: a dictionary of 40,000 entries with playlist id as the key, and transformed playlist id as the value
users_transform = []
for user in users:
    users_transform.append(pids[user])
users_transform
: a list of transformed playlist ids, of length 2,657,804, which is the number of user-song interactions in the training data.
## Assign an id to each track uri
# count the appearances of each track
count = {}
for i in item:
    if i not in count:
        count[i] = 1
    else:
        count[i] = count[i] + 1
# print(len(count))
count
: a dictionary of 401,492 entries with track uri as the key and the track's number of appearances as the value.
item_id = {}
id = 0
for i in count.keys():
    item_id[i] = id
    id = id + 1
# print(len(item_id))
item_id
: a dictionary of 401,492 entries with track uri as the key and the transformed track id (0, 1, 2, ...) as the value.
item_transformed = []
for i in item:
    item_transformed.append(item_id[i])
# print(len(item_transformed))
item_transformed
: a list of transformed track ids, aligned element-wise with users_transform, with a length of 2,657,804.
ratings = np.ones(len(item_transformed))
ratings
: an array of 1s marking each observed user-song interaction (implicit feedback), with a length of 2,657,804.
# prepare the interaction matrix
data = Interactions(np.array(users_transform), np.array(item_transformed), ratings)
data
: the prepared arrays wrapped in Spotlight's Interactions object, the format its models consume.
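A quick sanity check before training can confirm the wrapper picked up the right dimensions; num_users, num_items and ratings are attributes of Spotlight's Interactions class, and the expected counts below come from the descriptions above:
# Dimensions inferred by Spotlight from the id arrays.
print(data.num_users, data.num_items)   # expected: 40000 users, 401492 items
print(len(data.ratings))                # expected: 2657804 interactions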
model = ImplicitFactorizationModel(n_iter = 5)
model.fit(data, verbose = 1)
torch.save(model, 'baseline_model_40000')
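The CF_recommended_song.json file loaded below is not produced anywhere in this notebook; presumably it holds, for each of the first 1,000 validation playlists, a dict mapping candidate track uris to their collaborative-filtering scores. A minimal sketch of how it could be generated with the trained Spotlight model follows; the names data_val, val_pids, id_to_uri and the top-500 cutoff are assumptions, not the original pipeline.
# Sketch (assumed): score every known track for each validation playlist with
# model.predict, keep the 500 highest-scoring tracks, and map ids back to uris.
with open('CF_baseline_Val_20000.json', 'r') as f:
    data_val = json.load(f)
id_to_uri = {v: k for k, v in item_id.items()}
# Distinct pids in the order they appear in the validation file (matches `validation`).
val_pids = list(dict.fromkeys(pid for pid, uri in data_val))

CF_recommended_song = []
for pid in val_pids[:1000]:
    if pid not in pids:                      # playlist unseen during training
        CF_recommended_song.append({})
        continue
    scores = model.predict(pids[pid])        # one score per known track
    top = np.argsort(-scores)[:500]
    CF_recommended_song.append({id_to_uri[j]: float(scores[j]) for j in top})

with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'w') as f:
    json.dump(CF_recommended_song, f)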
with open('/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json', 'r') as f:
    y_validation = json.load(f)
with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'r') as f:
    recs = json.load(f)
# Keep the first 100 recommended uris per playlist.
recommendation = [list(single_score.keys())[:100] for single_score in recs]
R_precision_scores = []
NDCG_scores = []
for i in range(1000):
    rec = recommendation[i]
    Y = y_validation[i]
    R_precision_scores.append(R_precision(rec, Y))
    NDCG_scores.append(NDCG(rec, Y))
def R_precision(recommendation, Y):
    # Fraction of the ground-truth tracks that appear anywhere in the recommendation list.
    return len(set(recommendation) & set(Y)) / len(Y)

def NDCG(recommendation, Y):
    # Binary relevance: a recommended track is relevant if it is in the ground truth.
    DCG = 0
    for i in range(len(recommendation)):
        if recommendation[i] in Y and i == 0:
            DCG += 1
        elif i > 0 and recommendation[i] in Y:
            DCG += 1 / math.log(i + 2, 2)
    # Ideal DCG: all len(Y) relevant tracks ranked at the top of the list.
    IDCG = 0
    for i in range(len(Y)):
        if i == 0:
            IDCG += 1
        else:
            IDCG += 1 / math.log(i + 2, 2)
    return DCG / IDCG
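As a quick hand-worked check of both metrics (toy lists, not real uris): two of the four ground-truth tracks are recovered, at ranks 1 and 3.
rec_toy = ['a', 'x', 'b', 'y']
Y_toy = ['a', 'b', 'c', 'd']
R_precision(rec_toy, Y_toy)  # 2/4 = 0.5
NDCG(rec_toy, Y_toy)         # (1 + 1/log2(4)) / (1 + 1/log2(3) + 1/log2(4) + 1/log2(5)) ≈ 0.59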
np.mean(R_precision_scores)
np.mean(NDCG_scores)
song_df = pd.read_csv('test0-99.csv')
uri_full = song_df.iloc[:, 3].values
audio_dict_full = {uri: {} for uri in uri_full}
# token: Spotify client credentials manager (defined elsewhere).
sp = spotipy.Spotify(client_credentials_manager=token, requests_timeout=10)
start_time = datetime.datetime.now()
batch = 50
# Query the Spotify audio-features endpoint in batches of 50 uris (ceil keeps the final partial batch).
for k in tqdm(range(math.ceil(len(uri_full) / batch))):
    uris = [uri_full[i] for i in range(batch * k, min(batch * (k + 1), len(uri_full)))]
    extracted = sp.audio_features(uris)
    for i in range(len(uris)):
        try:
            for feature in ['danceability', 'energy', 'key', 'loudness', 'mode',
                            'speechiness', 'acousticness', 'instrumentalness',
                            'liveness', 'valence', 'tempo']:
                audio_dict_full[uri_full[batch * k + i]][feature] = extracted[i][feature]
        except TypeError:
            # audio_features() returns None for tracks it cannot resolve.
            pass
end_time = datetime.datetime.now()
print(end_time - start_time)
audio_features = pd.DataFrame.from_dict(audio_dict_full, orient='index')
audio_features_file = 'audio_features.csv'
with open(audio_features_file, mode='w') as f:
    audio_features.to_csv(f)
audio_features
: a dataframe with one row per track uri and one column per extracted audio feature.
audio_features_norm = pd.read_csv('/Users/haoxinli/DocumentsLocal/Spotify/MPD_audio_0_99_norm.csv')
audio_features_norm = audio_features_norm.iloc[:,1:]
audio_features_values = audio_features_norm.iloc[:,:-1].values
trackid = audio_features_norm['track_id']
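The normalized feature file MPD_audio_0_99_norm.csv is loaded above but its creation is not shown. A plausible sketch of that step, assuming min-max scaling of the audio_features.csv written earlier and a final 'track_id' column holding the track identifier (the scaler choice and column layout are assumptions):
# Sketch (assumed): scale each audio feature to [0, 1] and append the track
# identifier as a final 'track_id' column, matching how the file is read back above.
from sklearn.preprocessing import MinMaxScaler

raw = pd.read_csv('audio_features.csv', index_col=0)   # index = track identifier
norm = pd.DataFrame(MinMaxScaler().fit_transform(raw.values), columns=raw.columns)
norm['track_id'] = raw.index
norm.to_csv('MPD_audio_0_99_norm.csv')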
def hybrid(recommendation, validation):
    # Re-rank each playlist's top-100 CF candidates by their summed cosine similarity
    # (over audio features) to the tracks already known to be in the playlist.
    hybrid_rec = []
    for i in tqdm(range(len(recommendation))):
        dictionary = {}
        rec_playlist = recommendation[i].copy()
        test_val = validation[i].copy()
        for song in rec_playlist[:100]:
            score = 0
            song_pos = np.where(trackid == 'spotify:track:' + song)[0]
            for truth_song in test_val:
                truth_song_pos = np.where(trackid == 'spotify:track:' + truth_song)[0]
                try:
                    score = score + cosine_similarity(audio_features_values[truth_song_pos],
                                                      audio_features_values[song_pos])[0][0]
                except:
                    # Skip pairs whose audio features are missing from the table.
                    pass
            dictionary[song] = score
        dictionary = {k: v for k, v in sorted(dictionary.items(), key=lambda item: item[1], reverse=True)}
        hybrid_rec.append(dictionary)
    return hybrid_rec
with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'r') as f:
    recs = json.load(f)
# hybrid() slices its candidates, so pass the ranked track uris rather than the score dicts.
recommendation = [list(single_score.keys()) for single_score in recs]
with open("/Users/haoxinli/DocumentsLocal/Spotify/X_validation.json", "r") as f:
    validation = json.load(f)
hybrid_rec = hybrid(recommendation, validation[:1000])
with open('/Users/haoxinli/DocumentsLocal/Spotify/hybrid_rec.json', 'w') as f:
    json.dump(hybrid_rec, f)
hybrid_rec
: a list of dicts, one per playlist, mapping each CF-recommended track uri to its summed audio similarity with the songs already in the playlist, sorted from most to least similar.
with open('/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json', 'r') as f:
    y_validation = json.load(f)
with open('/Users/haoxinli/DocumentsLocal/Spotify/hybrid_rec.json', 'r') as f:
    hybrid_rec = json.load(f)
recommendation = [list(single_score.keys())[:100] for single_score in hybrid_rec]
R_precision_scores = []
NDCG_scores = []
for i in range(1000):
    rec = recommendation[i]
    Y = y_validation[i]
    R_precision_scores.append(R_precision(rec, Y))
    NDCG_scores.append(NDCG(rec, Y))
np.mean(R_precision_scores)
np.mean(NDCG_scores)