"""
Attribution: https://github.com/AIPI540/AIPI540-Deep-Learning-Applications/
Jon Reifschneider
Brinnae Bent
"""
import os
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split


class NNColabFiltering(nn.Module):
    '''
    Neural collaborative filtering model. Learns an embedding for each
    playlist (user) and artist (item), concatenates the two, and passes the
    result through a small MLP to predict a rating scaled to rating_range.
    '''
    def __init__(self, n_playlists, n_artists, embedding_dim_users, embedding_dim_items, n_activations, rating_range):
        super().__init__()
        self.user_embeddings = nn.Embedding(num_embeddings=n_playlists, embedding_dim=embedding_dim_users)
        self.item_embeddings = nn.Embedding(num_embeddings=n_artists, embedding_dim=embedding_dim_items)
        self.fc1 = nn.Linear(embedding_dim_users + embedding_dim_items, n_activations)
        self.fc2 = nn.Linear(n_activations, 1)
        self.rating_range = rating_range

def forward(self, X):
# Get embeddings for minibatch
embedded_users = self.user_embeddings(X[:,0])
embedded_items = self.item_embeddings(X[:,1])
# Concatenate user and item embeddings
embeddings = torch.cat([embedded_users,embedded_items],dim=1)
# Pass embeddings through network
preds = self.fc1(embeddings)
preds = F.relu(preds)
preds = self.fc2(preds)
# Scale predicted ratings to target-range [low,high]
preds = torch.sigmoid(preds) * (self.rating_range[1]-self.rating_range[0]) + self.rating_range[0]
return preds
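
# Usage sketch for the model above (sizes and ids below are hypothetical,
# chosen only to illustrate the expected input/output shapes):
#
#   model = NNColabFiltering(n_playlists=10, n_artists=20,
#                            embedding_dim_users=8, embedding_dim_items=8,
#                            n_activations=16, rating_range=[0., 1.])
#   X = torch.tensor([[0, 3], [4, 7]])  # columns: [playlist_id, artist_id]
#   preds = model(X)                    # shape (2, 1), values in [0, 1]
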
def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size):
'''
    Wraps the training and validation arrays in TensorDatasets and returns minibatch DataLoaders
Inputs:
X_train: training data features
y_train: training data target
X_val: validation data features
y_val: validation data targets
batch_size: the batch size to use
Returns:
trainloader: training dataloader
valloader: validation dataloader
'''
    # Convert training and validation data to TensorDatasets
trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(),
torch.from_numpy(np.array(y_train)).float())
valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(),
torch.from_numpy(np.array(y_val)).float())
    # Create DataLoaders for the training and validation data so we can iterate over minibatches
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)
return trainloader, valloader
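
# Example call (a sketch; X_demo and y_demo are made-up toy arrays whose
# columns mirror the real [playlist_id, artist_album_id] features):
#
#   X_demo = np.array([[0, 1], [1, 2], [2, 0], [0, 2]])
#   y_demo = np.array([0.9, 0.1, 0.5, 0.3])
#   trainloader, valloader = prep_dataloaders(X_demo[:3], y_demo[:3],
#                                             X_demo[3:], y_demo[3:],
#                                             batch_size=2)
#   # Each batch yields (LongTensor of shape (batch, 2), FloatTensor of shape (batch,))
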
def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None):
'''
    Runs the training loop, alternating between train and validation phases each epoch
    Inputs:
        model: the model to train
        criterion: the loss function to optimize
        optimizer: the optimizer to use for training
        dataloaders: dict with 'train' and 'val' dataloaders
        device: the torch device (cpu/gpu) to train on
        num_epochs: number of epochs to use for training
        scheduler: optional learning-rate scheduler, stepped once per epoch
    Returns:
        costpaths: the loss for each epoch for training and validation
'''
model = model.to(device)
since = time.time()
costpaths = {'train':[],'val':[]}
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
for phase in ['train', 'val']:
if phase == 'train':
model.train()
else:
model.eval()
running_loss = 0.0
index = 0
for (inputs,labels) in dataloaders[phase]:
inputs = inputs.to(device)
labels = labels.to(device)
optimizer.zero_grad()
with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs).view(-1)
loss = criterion(outputs, labels)
if phase == 'train':
loss.backward()
optimizer.step()
                # Track RMSE: sqrt of the batch MSE, weighted by batch size so
                # the per-sample average can be taken at the end of the epoch
                running_loss += np.sqrt(loss.item()) * labels.size(0)
                print(f'\r{running_loss:.2f} {index} {(index / len(dataloaders[phase])) * 100:.2f}%', end='')
                index += 1
if (phase == 'train') and (scheduler is not None):
scheduler.step()
epoch_loss = running_loss / len(dataloaders[phase].dataset)
costpaths[phase].append(epoch_loss)
print('\n{} loss: {:.4f}'.format(phase, epoch_loss))
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
return costpaths
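
# The scheduler argument is optional; a sketch using a standard PyTorch
# scheduler (StepLR is an illustrative choice, any torch.optim.lr_scheduler
# instance with a step() method works):
#
#   scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
#   costpaths = train_model(model, criterion, optimizer, dataloaders,
#                           device, num_epochs=10, scheduler=scheduler)
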
if __name__ == '__main__':
    artists = pd.read_csv(os.path.join(os.getcwd(), 'data', 'processed', 'playlists.csv'))
    X = artists.loc[:, ['playlist_id', 'artist_album_id']]
    y = artists.loc[:, 'song_percent']
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.2)
    batchsize = 64
    trainloader, valloader = prep_dataloaders(X_train, y_train, X_val, y_val, batchsize)
    dataloaders = {'train': trainloader, 'val': valloader}
    # Embedding tables must be large enough to index the maximum id
    n_users = X.loc[:, 'playlist_id'].max() + 1
    n_items = X.loc[:, 'artist_album_id'].max() + 1
    model = NNColabFiltering(n_users, n_items, embedding_dim_users=50, embedding_dim_items=50, n_activations=100, rating_range=[0., 1.])
criterion = nn.MSELoss()
    lr = 0.001
    n_epochs = 10
    wd = 1e-3
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cost_paths = train_model(model, criterion, optimizer, dataloaders, device, n_epochs, scheduler=None)
    # Save the entire model (create the output dir first so the save cannot fail)
    os.makedirs(os.path.join(os.getcwd(), 'models'), exist_ok=True)
    torch.save(model, os.path.join(os.getcwd(), 'models', 'recommender.pt'))
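    # Reload sketch (saving the full module means NNColabFiltering must be
    # importable at load time; newer PyTorch may also need weights_only=False):
    #
    #   model = torch.load(os.path.join(os.getcwd(), 'models', 'recommender.pt'))
    #   model.eval()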