"""
Attribution: https://github.com/AIPI540/AIPI540-Deep-Learning-Applications/
Jon Reifschneider
Brinnae Bent
"""
import os
import pandas as pd
import time
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split
class NNColabFiltering(nn.Module):
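    '''
    Neural collaborative filtering model: learns an embedding for each playlist
    (user) and each artist/album (item), concatenates the two embeddings, and
    passes them through a small feed-forward network to predict a rating that
    is squashed into rating_range with a sigmoid
    '''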
def __init__(self, n_playlists, n_artists, embedding_dim_users, embedding_dim_items, n_activations, rating_range):
super().__init__()
self.user_embeddings = nn.Embedding(num_embeddings=n_playlists,embedding_dim=embedding_dim_users)
self.item_embeddings = nn.Embedding(num_embeddings=n_artists,embedding_dim=embedding_dim_items)
self.fc1 = nn.Linear(embedding_dim_users+embedding_dim_items,n_activations)
self.fc2 = nn.Linear(n_activations,1)
self.rating_range = rating_range
def forward(self, X):
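        '''
        X is a LongTensor of shape (batch_size, 2) holding [playlist_id, artist_album_id]
        pairs; returns predicted ratings scaled into rating_range
        '''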
# Get embeddings for minibatch
embedded_users = self.user_embeddings(X[:,0])
embedded_items = self.item_embeddings(X[:,1])
# Concatenate user and item embeddings
embeddings = torch.cat([embedded_users,embedded_items],dim=1)
# Pass embeddings through network
preds = self.fc1(embeddings)
preds = F.relu(preds)
preds = self.fc2(preds)
# Scale predicted ratings to target-range [low,high]
preds = torch.sigmoid(preds) * (self.rating_range[1]-self.rating_range[0]) + self.rating_range[0]
return preds
def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
'''
    Converts the training and validation arrays into TensorDatasets and wraps them in minibatch DataLoaders
Inputs:
X_train: training data features
y_train: training data target
X_val: validation data features
y_val: validation data targets
batch_size: the batch size to use
Returns:
trainloader: training dataloader
valloader: validation dataloader
'''
    # Convert training and validation data to TensorDatasets
trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(),
torch.from_numpy(np.array(y_train)).float())
valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(),
torch.from_numpy(np.array(y_val)).float())
    # Create DataLoaders for the training and validation data so we can iterate over minibatches
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)
return trainloader, valloader
def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
    '''
    Trains the model, evaluating on the validation set after each epoch
    Inputs:
        model: the model to train
        criterion: the loss function to optimize
        optimizer: the optimizer to use for training
        dataloaders: dict of 'train' and 'val' dataloaders to iterate over during training and validation
        device: the torch device (cpu/gpu) to run on
        num_epochs: number of epochs to train for
        scheduler: optional learning-rate scheduler, stepped after each training epoch
    Returns:
        costpaths: dict of per-epoch training and validation losses (approximate RMSE)
    '''
model = model.to(device)
since = time.time()
costpaths = {'train':[],'val':[]}
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
for phase in ['train', 'val']:
if phase == 'train':
model.train()
else:
model.eval()
running_loss = 0.0
index = 0
for (inputs,labels) in dataloaders[phase]:
inputs = inputs.to(device)
labels = labels.to(device)
optimizer.zero_grad()
with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs).view(-1)
loss = criterion(outputs, labels)
if phase == 'train':
loss.backward()
optimizer.step()
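                # Track an approximate RMSE: accumulate the square root of each
                # batch's MSE weighted by batch size; averaged per sample below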
running_loss += np.sqrt(loss.item()) * labels.size(0)
print(f'\r{running_loss} {index} {(index / len(dataloaders[phase]))*100:.2f}%', end='')
index +=1
if (phase == 'train') and (scheduler is not None):
scheduler.step()
epoch_loss = running_loss / len(dataloaders[phase].dataset)
costpaths[phase].append(epoch_loss)
print('\n{} loss: {:.4f}'.format(phase, epoch_loss))
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
return costpaths
if __name__ == '__main__':
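    # Load the processed playlist data; expected columns include
    # playlist_id, artist_album_id, and song_percent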
    playlists = pd.read_csv(os.getcwd() + '/data/processed/playlists.csv')
    X = playlists.loc[:, ['playlist_id', 'artist_album_id']]
    y = playlists.loc[:, 'song_percent']
    # Split our data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)
batchsize = 64
trainloader,valloader = prep_dataloaders(X_train,y_train,X_val,y_val,batchsize)
dataloaders = {'train':trainloader, 'val':valloader}
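    # Size the embedding tables to the largest ID + 1 (assumes zero-based integer IDs)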
n_users = X.loc[:,'playlist_id'].max()+1
n_items = X.loc[:,'artist_album_id'].max()+1
    model = NNColabFiltering(n_users, n_items, embedding_dim_users=50, embedding_dim_items=50, n_activations=100, rating_range=[0., 1.])
criterion = nn.MSELoss()
lr=0.001
n_epochs=10
wd=1e-3
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cost_paths = train_model(model,criterion,optimizer,dataloaders, device,n_epochs, scheduler=None)
# Save the entire model
torch.save(model, os.getcwd() + '/models/recommender.pt')
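    # Illustrative sanity check (a sketch, not required for training): score a few
    # validation playlist-artist pairs with the trained model; the [:5] slice is an
    # arbitrary sample
    model.eval()
    with torch.no_grad():
        sample_pairs = torch.from_numpy(np.array(X_val[:5])).long().to(device)
        sample_preds = model(sample_pairs).view(-1)
    print('Sample predicted song_percent values:', sample_preds.cpu().numpy())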