"""

Attribution: https://github.com/AIPI540/AIPI540-Deep-Learning-Applications/



Jon Reifschneider

Brinnae Bent 



"""

import os
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split


class NNColabFiltering(nn.Module):
    '''
    Neural network collaborative filtering model. Learns playlist (user) and
    artist (item) embeddings, concatenates them, and passes the result through
    a small feed-forward network to predict a rating within rating_range.
    '''

    def __init__(self, n_playlists, n_artists, embedding_dim_users, embedding_dim_items, n_activations, rating_range):
        super().__init__()
        self.user_embeddings = nn.Embedding(num_embeddings=n_playlists, embedding_dim=embedding_dim_users)
        self.item_embeddings = nn.Embedding(num_embeddings=n_artists, embedding_dim=embedding_dim_items)
        self.fc1 = nn.Linear(embedding_dim_users + embedding_dim_items, n_activations)
        self.fc2 = nn.Linear(n_activations, 1)
        self.rating_range = rating_range

    def forward(self, X):
        # Get embeddings for the minibatch of (playlist, artist) index pairs
        embedded_users = self.user_embeddings(X[:, 0])
        embedded_items = self.item_embeddings(X[:, 1])
        # Concatenate user and item embeddings
        embeddings = torch.cat([embedded_users, embedded_items], dim=1)
        # Pass embeddings through the feed-forward network
        preds = self.fc1(embeddings)
        preds = F.relu(preds)
        preds = self.fc2(preds)
        # Scale predicted ratings to the target range [low, high]
        preds = torch.sigmoid(preds) * (self.rating_range[1] - self.rating_range[0]) + self.rating_range[0]
        return preds
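
# A minimal usage sketch (not part of the original script): NNColabFiltering
# expects a long tensor of shape (batch_size, 2) holding
# (playlist_id, artist_album_id) index pairs and returns one predicted rating
# per row, scaled into rating_range. The helper below is hypothetical and its
# sizes/ids are illustrative only.
def _demo_forward_pass():
    demo_model = NNColabFiltering(n_playlists=1000, n_artists=5000,
                                  embedding_dim_users=50, embedding_dim_items=50,
                                  n_activations=100, rating_range=[0., 1.])
    demo_batch = torch.tensor([[0, 10], [3, 42]], dtype=torch.long)  # (batch_size, 2)
    return demo_model(demo_batch)  # shape (batch_size, 1), values in [0, 1]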

def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
    '''
    Wraps the training and validation data in TensorDatasets and builds DataLoaders

    Inputs:
        X_train: training data features
        y_train: training data targets
        X_val: validation data features
        y_val: validation data targets
        batch_size: the batch size to use

    Returns:
        trainloader: training dataloader
        valloader: validation dataloader
    '''
    # Convert training and validation data to TensorDatasets
    trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(),
                             torch.from_numpy(np.array(y_train)).float())
    valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(),
                           torch.from_numpy(np.array(y_val)).float())

    # Create DataLoaders for the training and validation data to allow us to iterate over minibatches
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

    return trainloader, valloader

def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
    '''
    Trains the model, running a training and a validation phase each epoch

    Inputs:
        model: the model to train
        criterion: the loss function to optimize
        optimizer: the optimizer to use for training
        dataloaders: dict of dataloaders to use for the training and validation phases
        device: the torch device (cpu or gpu) to train on
        num_epochs: number of epochs to train for
        scheduler: optional learning rate scheduler, stepped after each training phase

    Returns:
        costpaths: the training and validation loss for each epoch
    '''
    model = model.to(device) 
    since = time.time()

    costpaths = {'train':[],'val':[]}

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch runs a training phase followed by a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()   # Set model to training mode
            else:
                model.eval()    # Set model to evaluation mode

            running_loss = 0.0

            index = 0
            for (inputs,labels) in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # Track gradients only during the training phase
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs).view(-1)
                    loss = criterion(outputs, labels)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Accumulate per-batch RMSE weighted by batch size
                running_loss += np.sqrt(loss.item()) * labels.size(0)
                print(f'\r{running_loss:.4f} {index} {(index / len(dataloaders[phase])) * 100:.2f}%', end='')
                index += 1

            if (phase == 'train') and (scheduler is not None):
                scheduler.step()

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            costpaths[phase].append(epoch_loss)
            print('\n{} loss: {:.4f}'.format(phase, epoch_loss))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    return costpaths


if __name__ == '__main__':
    # Load the processed playlist-artist interaction data
    artists = pd.read_csv(os.getcwd() + '/data/processed/playlists.csv')
    X = artists.loc[:, ['playlist_id', 'artist_album_id']]
    y = artists.loc[:, 'song_percent']

    # Split our data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.2)
    batchsize = 64
    trainloader, valloader = prep_dataloaders(X_train, y_train, X_val, y_val, batchsize)

    dataloaders = {'train': trainloader, 'val': valloader}
    # Embedding table sizes: one row per possible id, so use max id + 1
    n_users = X.loc[:, 'playlist_id'].max() + 1
    n_items = X.loc[:, 'artist_album_id'].max() + 1
    model = NNColabFiltering(n_users, n_items, embedding_dim_users=50, embedding_dim_items=50, n_activations=100, rating_range=[0., 1.])
    criterion = nn.MSELoss()
    lr=0.001
    n_epochs=10
    wd=1e-3
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    cost_paths = train_model(model,criterion,optimizer,dataloaders, device,n_epochs, scheduler=None)

    # Save the entire model
    torch.save(model, os.getcwd() + '/models/recommender.pt')
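
    # --- Hedged inference sketch (not in the original script) ---------------
    # Shows how the trained model might be used to rank artists for a single
    # playlist by scoring all candidate items in one batch. The choice of
    # playlist (the first row of X) and the top-10 cutoff are illustrative
    # assumptions, not part of the original pipeline.
    model.eval()
    with torch.no_grad():
        example_playlist = int(X.iloc[0]['playlist_id'])
        candidate_items = torch.arange(int(n_items), dtype=torch.long, device=device)
        playlist_column = torch.full_like(candidate_items, example_playlist)
        pairs = torch.stack([playlist_column, candidate_items], dim=1)
        scores = model(pairs).view(-1)
        top_scores, top_items = torch.topk(scores, k=min(10, int(n_items)))
        print('Top predicted artist_album_ids for playlist {}: {}'.format(
            example_playlist, top_items.cpu().tolist()))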