# keesephillips's picture
# Added Naive model and comments
# 6ce6b56 verified
import numpy as np
import os
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import shutil
import os
def make_dir(directory):
    '''
    Ensures that an empty directory exists at the given path

    If something already exists at the path it is removed, together with
    everything inside it, before the directory is created again.

    Inputs:
        directory: path to create a new directory at
    Returns:
        None
    '''
    # Clear out any previous contents so the directory always starts blank.
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)
def read_parquet_folder(folder_path):
    '''
    Creates the pandas dataframe from a folder of parquet files

    Only entries whose name ends with '.parquet' are read. They are read in
    sorted filename order so the row order of the result is the same on
    every run (os.listdir itself returns names in arbitrary order).

    Inputs:
        folder_path: the folder path for the parquet files
    Returns:
        a single dataframe holding the rows of every parquet file,
        with a fresh 0..n-1 index
    Raises:
        ValueError: if the folder contains no '.parquet' files
    '''
    parquet_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.parquet')
    )

    # pd.concat raises an opaque "No objects to concatenate" on an empty
    # list; fail with the same exception type but name the folder.
    if not parquet_files:
        raise ValueError(f'No .parquet files found in {folder_path!r}')

    dataframes = [
        pd.read_parquet(os.path.join(folder_path, name))
        for name in parquet_files
    ]
    return pd.concat(dataframes, ignore_index=True)
def create_ids(df, col, name, output_dir=None):
    '''
    Creates unique ids for the features and creates mapping documents

    Each distinct value of `col` gets an integer id, numbered from 0 in
    order of first appearance. The ids are stored in a new column
    '<name>_id' and the id/value mapping is written to '<name>.csv'.

    Inputs:
        df: dataframe with the features (modified in place)
        col: column to create ids on
        name: name of the newly created id
        output_dir: directory the mapping csv is written to; defaults to
            '<current working directory>/data/processed'
    Returns:
        df: dataframe with the mapped ids
    '''
    if output_dir is None:
        output_dir = os.path.join(os.getcwd(), 'data', 'processed')
    # Make sure the target exists so the csv write cannot fail on a
    # missing folder.
    os.makedirs(output_dir, exist_ok=True)

    id_col = f'{name}_id'
    value_to_id = {val: i for i, val in enumerate(df[col].unique())}
    df[id_col] = df[col].map(value_to_id)

    # One row per distinct (id, value) pair; the dataframe index is kept
    # in the csv, matching the original output format.
    mapping = df[[id_col, col]].drop_duplicates()
    mapping.to_csv(os.path.join(output_dir, f'{name}.csv'))
    return df
if __name__ == '__main__':
    # Load every raw parquet file, then reset the processed-output folder.
    cwd = os.getcwd()
    df = read_parquet_folder(cwd + '/data/raw/data')
    directory = cwd + '/data/processed'
    make_dir(directory)

    # Integer ids (and their csv mapping files) for artists, playlists, albums.
    id_specs = (
        ('artist_name', 'artist'),
        ('pid', 'playlist'),
        ('album_name', 'album'),
    )
    for source_col, id_name in id_specs:
        df = create_ids(df, source_col, id_name)

    # Distinct track names per (playlist, artist, album), repeated on each row.
    tracks_by_album = df.groupby(['pid', 'artist_name', 'album_name'])['track_name']
    df['song_count'] = tracks_by_album.transform('nunique')

    # Playlist length taken as the largest 'pos' plus one.
    # assumes 'pos' is a 0-based position within the playlist - TODO confirm
    df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max') + 1

    # Combined "artist::album" key with its own id and mapping file.
    df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
    pair_to_id = {
        pair: idx for idx, pair in enumerate(df['artist_album'].unique())
    }
    df['artist_album_id'] = df['artist_album'].map(pair_to_id)
    pair_columns = [
        'artist_album_id',
        'artist_album',
        'artist_name',
        'album_name',
        'track_name',
    ]
    df[pair_columns].drop_duplicates().to_csv(
        cwd + '/data/processed/artist_album.csv'
    )

    # NOTE(review): song_count already holds a per-row group count, so summing
    # it again over (playlist_id, artist_album_id) presumably multiplies it by
    # the number of rows in the group - confirm this is intended.
    by_playlist_pair = df.groupby(['playlist_id', 'artist_album_id'])['song_count']
    df['song_count'] = by_playlist_pair.transform('sum')

    # Label-encode track names.
    # NOTE(review): track_id is not used again in the visible code.
    encoder = LabelEncoder()
    encoder.fit(df['track_name'])
    df['track_id'] = encoder.transform(df['track_name'])

    # Share of the playlist per artist/album pair, passed through a sigmoid.
    share = df['song_count'] / df['playlist_songs']
    df['song_percent'] = 1 / (1 + np.exp(-share))

    # One row per distinct (playlist, artist/album, song_percent) combination;
    # only the two id columns are written out.
    artists = df[['playlist_id', 'artist_album_id', 'song_percent']].drop_duplicates()
    artists[['playlist_id', 'artist_album_id']].to_csv(
        cwd + '/data/processed/playlists.csv'
    )