|
|
"""
|
|
|
Attribution: https://github.com/AIPI540/AIPI540-Deep-Learning-Applications/
|
|
|
|
|
|
Jon Reifschneider
|
|
|
Brinnae Bent
|
|
|
|
|
|
"""
|
|
|
|
|
|
import os
|
|
|
import pandas as pd
|
|
|
import time
|
|
|
import torch
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
import torch.nn as nn
|
|
|
import torch.nn.functional as F
|
|
|
import torch.optim as optim
|
|
|
from torch.utils.data import TensorDataset
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
|
|
|
class NNColabFiltering(nn.Module):
    """Neural-network collaborative filtering model.

    Learns one embedding table for playlists ("users") and one for artists
    ("items"), concatenates the looked-up vectors, and scores the pair with a
    two-layer MLP whose sigmoid output is rescaled into ``rating_range``.
    """

    def __init__(self, n_playlists, n_artists, embedding_dim_users, embedding_dim_items, n_activations, rating_range):
        super().__init__()
        # Lookup tables mapping integer ids to dense vectors.
        self.user_embeddings = nn.Embedding(num_embeddings=n_playlists, embedding_dim=embedding_dim_users)
        self.item_embeddings = nn.Embedding(num_embeddings=n_artists, embedding_dim=embedding_dim_items)
        # MLP head scoring the concatenated (user, item) embedding.
        self.fc1 = nn.Linear(embedding_dim_users + embedding_dim_items, n_activations)
        self.fc2 = nn.Linear(n_activations, 1)
        # (low, high) bounds the prediction is squashed into.
        self.rating_range = rating_range

    def forward(self, X):
        # Column 0 holds playlist ids, column 1 holds artist ids.
        user_vecs = self.user_embeddings(X[:, 0])
        item_vecs = self.item_embeddings(X[:, 1])
        combined = torch.cat([user_vecs, item_vecs], dim=1)
        hidden = F.relu(self.fc1(combined))
        score = self.fc2(hidden)
        # Map sigmoid's (0, 1) output onto the configured rating range.
        low, high = self.rating_range[0], self.rating_range[1]
        return torch.sigmoid(score) * (high - low) + low
|
|
|
|
|
|
def prep_dataloaders(X_train, y_train, X_val, y_val, batch_size):
    '''
    Wrap the train/validation splits in PyTorch DataLoaders.

    Inputs:
        X_train: training data features (id pairs)
        y_train: training data targets
        X_val: validation data features
        y_val: validation data targets
        batch_size: the batch size to use

    Returns:
        trainloader: training dataloader (reshuffled each epoch)
        valloader: validation dataloader (fixed order)
    '''
    def _as_dataset(features, targets):
        # Features are integer ids (long); targets are continuous (float).
        return TensorDataset(torch.from_numpy(np.array(features)).long(),
                             torch.from_numpy(np.array(targets)).float())

    trainloader = torch.utils.data.DataLoader(_as_dataset(X_train, y_train),
                                              batch_size=batch_size, shuffle=True)
    valloader = torch.utils.data.DataLoader(_as_dataset(X_val, y_val),
                                            batch_size=batch_size, shuffle=False)

    return trainloader, valloader
|
|
|
|
|
|
def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
    '''
    Train the model, running a training and a validation pass each epoch.

    Inputs:
        model: the model to train
        criterion: the loss function (e.g. nn.MSELoss)
        optimizer: the optimizer updating the model parameters
        dataloaders: dict with 'train' and 'val' DataLoaders
        device: the torch defined cpu/gpu device to run on
        num_epochs: number of epochs to use for training
        scheduler: optional LR scheduler, stepped once per training epoch

    Returns:
        costpaths: dict mapping 'train'/'val' to the per-epoch loss history
    '''
    model = model.to(device)
    since = time.time()

    costpaths = {'train': [], 'val': []}

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'val']:
            # Toggle dropout/batch-norm behavior for the current phase.
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0

            index = 0
            for (inputs, labels) in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # Track gradients only during the training phase.
                with torch.set_grad_enabled(phase == 'train'):
                    # Bug fix: call model(inputs), not model.forward(inputs),
                    # so nn.Module.__call__ runs and registered hooks fire.
                    outputs = model(inputs).view(-1)
                    loss = criterion(outputs, labels)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Accumulate per-batch RMSE weighted by batch size so the
                # epoch average below is a per-sample figure.
                running_loss += np.sqrt(loss.item()) * labels.size(0)
                print(f'\r{running_loss} {index} {(index / len(dataloaders[phase]))*100:.2f}%', end='')
                index += 1

            if (phase == 'train') and (scheduler is not None):
                scheduler.step()

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            costpaths[phase].append(epoch_loss)
            print('\n{} loss: {:.4f}'.format(phase, epoch_loss))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    return costpaths
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Load the processed playlist/artist interaction table.
    playlists_df = pd.read_csv(os.getcwd() + '/data/processed/playlists.csv')
    X = playlists_df.loc[:, ['playlist_id', 'artist_album_id',]]
    y = playlists_df.loc[:, 'song_percent']

    # Hold out 20% of interactions for validation.
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.2)
    batchsize = 64
    trainloader, valloader = prep_dataloaders(X_train, y_train, X_val, y_val, batchsize)

    dataloaders = {'train': trainloader, 'val': valloader}
    # Embedding tables must be large enough to index the largest id seen.
    n_users = X.loc[:, 'playlist_id'].max() + 1
    n_items = X.loc[:, 'artist_album_id'].max() + 1
    model = NNColabFiltering(n_users, n_items, embedding_dim_users=50, embedding_dim_items=50, n_activations=100, rating_range=[0., 1.])
    criterion = nn.MSELoss()
    lr = 0.001
    n_epochs = 10
    wd = 1e-3
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    cost_paths = train_model(model, criterion, optimizer, dataloaders, device, n_epochs, scheduler=None)

    # Persist the full model object for later loading by the recommender.
    torch.save(model, os.getcwd() + '/models/recommender.pt')
|