""" Attribution: https://github.com/AIPI540/AIPI540-Deep-Learning-Applications/ Jon Reifschneider Brinnae Bent """ import os import pandas as pd import time import torch import numpy as np import pandas as pd import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from torch.utils.data import TensorDataset from sklearn.model_selection import train_test_split class NNColabFiltering(nn.Module): def __init__(self, n_playlists, n_artists, embedding_dim_users, embedding_dim_items, n_activations, rating_range): super().__init__() self.user_embeddings = nn.Embedding(num_embeddings=n_playlists,embedding_dim=embedding_dim_users) self.item_embeddings = nn.Embedding(num_embeddings=n_artists,embedding_dim=embedding_dim_items) self.fc1 = nn.Linear(embedding_dim_users+embedding_dim_items,n_activations) self.fc2 = nn.Linear(n_activations,1) self.rating_range = rating_range def forward(self, X): # Get embeddings for minibatch embedded_users = self.user_embeddings(X[:,0]) embedded_items = self.item_embeddings(X[:,1]) # Concatenate user and item embeddings embeddings = torch.cat([embedded_users,embedded_items],dim=1) # Pass embeddings through network preds = self.fc1(embeddings) preds = F.relu(preds) preds = self.fc2(preds) # Scale predicted ratings to target-range [low,high] preds = torch.sigmoid(preds) * (self.rating_range[1]-self.rating_range[0]) + self.rating_range[0] return preds def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size): ''' Loads the prefetched data from the output dir Inputs: X_train: training data features y_train: training data target X_val: validation data features y_val: validation data targets batch_size: the batch size to use Returns: trainloader: training dataloader valloader: validation dataloader ''' # Convert training and test data to TensorDatasets trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(), torch.from_numpy(np.array(y_train)).float()) valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(), torch.from_numpy(np.array(y_val)).float()) # Create Dataloaders for our training and test data to allow us to iterate over minibatches trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True) valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False) return trainloader, valloader def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None): ''' Loads the prefetched data from the output dir Inputs: model: the model to train criterion: the criterion to use to train optimizer: the optimizer to use to train dataloaders: the dict of dataloaders to user in the training and validation device: the torch defined cpu/gpu num_epochs: number of epochs to use for training scheduler: the scheduler to use to train for training Returns: costpaths: the loss for each epoch for validation and training ''' model = model.to(device) since = time.time() costpaths = {'train':[],'val':[]} for epoch in range(num_epochs): print('Epoch {}/{}'.format(epoch, num_epochs - 1)) print('-' * 10) for phase in ['train', 'val']: if phase == 'train': model.train() else: model.eval() running_loss = 0.0 index = 0 for (inputs,labels) in dataloaders[phase]: inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() with torch.set_grad_enabled(phase == 'train'): outputs = model.forward(inputs).view(-1) loss = criterion(outputs, labels) if phase == 'train': loss.backward() optimizer.step() running_loss += np.sqrt(loss.item()) * labels.size(0) 
                print(f'\r{running_loss:.2f} {index} {(index / len(dataloaders[phase])) * 100:.2f}%', end='')
                index += 1

            if (phase == 'train') and (scheduler is not None):
                scheduler.step()

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            costpaths[phase].append(epoch_loss)
            print('\n{} loss: {:.4f}'.format(phase, epoch_loss))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    return costpaths


if __name__ == '__main__':
    # Load the processed playlist-artist interaction data
    artists = pd.read_csv(os.getcwd() + '/data/processed/playlists.csv')
    X = artists.loc[:, ['playlist_id', 'artist_album_id']]
    y = artists.loc[:, 'song_percent']

    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.2)

    batchsize = 64
    trainloader, valloader = prep_dataloaders(X_train, y_train, X_val, y_val, batchsize)
    dataloaders = {'train': trainloader, 'val': valloader}

    # Embedding tables must be large enough to index the largest id
    n_users = X.loc[:, 'playlist_id'].max() + 1
    n_items = X.loc[:, 'artist_album_id'].max() + 1
    model = NNColabFiltering(n_users, n_items, embedding_dim_users=50,
                             embedding_dim_items=50, n_activations=100,
                             rating_range=[0., 1.])

    criterion = nn.MSELoss()
    lr = 0.001
    n_epochs = 10
    wd = 1e-3
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    cost_paths = train_model(model, criterion, optimizer, dataloaders,
                             device, n_epochs, scheduler=None)

    # Save the entire model (architecture + weights)
    torch.save(model, os.getcwd() + '/models/recommender.pt')
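
# ----------------------------------------------------------------------
# Illustrative inference sketch, not part of the original pipeline: score
# every candidate artist_album_id for one playlist with the trained model
# and keep the top_k highest predicted ratings. The function name and the
# example ids in the usage comment below are hypothetical.
# ----------------------------------------------------------------------
def recommend_artists(model, playlist_id, n_items, device, top_k=10):
    '''
    Scores all candidate artist/album ids for a single playlist and
    returns (ids, scores) for the top_k highest predicted ratings
    '''
    model.eval()
    # Build (playlist_id, candidate_id) pairs for every candidate item
    candidates = torch.arange(n_items, dtype=torch.long, device=device)
    playlists = torch.full_like(candidates, playlist_id)
    pairs = torch.stack([playlists, candidates], dim=1)
    with torch.no_grad():
        scores = model(pairs).view(-1)
    top_scores, top_ids = torch.topk(scores, k=top_k)
    return top_ids.tolist(), top_scores.tolist()

# Example usage (after training, with a hypothetical playlist id of 0):
#   ids, scores = recommend_artists(model, playlist_id=0, n_items=n_items,
#                                   device=device, top_k=10)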