| import numpy as np | |
| import os | |
| import pandas as pd | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.preprocessing import LabelEncoder | |
| import shutil | |
| import os | |
def make_dir(directory):
    '''
    Creates a new blank directory, replacing whatever is already at the path
    Inputs:
        directory: path to create a new directory at
    Returns:
        None
    '''
    # WARNING: anything already present at `directory` is deleted.
    if os.path.islink(directory) or os.path.isfile(directory):
        # shutil.rmtree refuses regular files and symlinks, so remove
        # those directly instead of crashing.
        os.remove(directory)
    elif os.path.isdir(directory):
        shutil.rmtree(directory)
    # Single creation point; also creates missing parent directories.
    os.makedirs(directory)
def read_parquet_folder(folder_path):
    '''
    Creates the pandas dataframe from a folder of parquet files
    Inputs:
        folder_path: the folder path for the parquet files
    Returns:
        a single dataframe with the rows of every '.parquet' file in the
        folder, concatenated in file-name order with a fresh 0..n-1 index
    Raises:
        ValueError: if the folder contains no '.parquet' files
    '''
    # os.listdir gives names in arbitrary order; sorting keeps the row order
    # (and anything derived from it) the same from run to run.
    parquet_files = sorted(
        file for file in os.listdir(folder_path) if file.endswith('.parquet')
    )
    if not parquet_files:
        # pd.concat would raise an opaque "No objects to concatenate" here;
        # keep the exception type but say which folder was empty.
        raise ValueError(f'No .parquet files found in {folder_path}')
    dataframes = [
        pd.read_parquet(os.path.join(folder_path, file))
        for file in parquet_files
    ]
    return pd.concat(dataframes, ignore_index=True)
def create_ids(df, col, name, out_dir=None):
    '''
    Creates unique ids for the features and creates mapping documents
    Inputs:
        df: dataframe with the features (modified in place)
        col: column to create ids on
        name: name of the newly created id; the new column is '<name>_id'
              and the mapping file is '<name>.csv'
        out_dir: directory to write the mapping csv to; defaults to
                 '<current working directory>/data/processed'
    Returns:
        df: dataframe with the mapped ids (the same object that was passed in)
    '''
    if out_dir is None:
        out_dir = os.path.join(os.getcwd(), 'data', 'processed')
    # Make sure the target exists so the function also works on its own.
    os.makedirs(out_dir, exist_ok=True)

    id_col = f'{name}_id'
    # Ids are 0-based and follow the order in which values first appear.
    value_to_id = {val: i for i, val in enumerate(df[col].unique())}
    df[id_col] = df[col].map(value_to_id)

    # One row per (id, value) pair; the dataframe index is written as the
    # first, unnamed csv column, as before.
    mapping = df[[id_col, col]].drop_duplicates()
    mapping.to_csv(os.path.join(out_dir, f'{name}.csv'))
    return df
def main():
    '''
    Builds the processed id/mapping csv files from the raw parquet data
    Inputs:
        (none; reads '<cwd>/data/raw/data' and rewrites '<cwd>/data/processed')
    Returns:
        None
    '''
    cwd = os.getcwd()
    folder_path = os.path.join(cwd, 'data', 'raw', 'data')
    df = read_parquet_folder(folder_path)

    # Start from an empty output directory (existing contents are deleted).
    directory = os.path.join(cwd, 'data', 'processed')
    make_dir(directory)

    df = create_ids(df, 'artist_name', 'artist')
    df = create_ids(df, 'pid', 'playlist')
    df = create_ids(df, 'album_name', 'album')

    # Playlist length: positions are 0-based, so the size is max(pos) + 1.
    df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max') + 1

    # Combined artist/album key. Vectorized concatenation instead of a
    # row-by-row '::'.join, which is a Python-level loop over every row.
    df['artist_album'] = df['artist_name'].str.cat(df['album_name'], sep='::')
    value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
    df['artist_album_id'] = df['artist_album'].map(value_to_id)
    artist_album_cols = [
        'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name',
    ]
    df[artist_album_cols].drop_duplicates().to_csv(
        os.path.join(directory, 'artist_album.csv')
    )

    # Distinct tracks per (playlist, artist/album). This used to be a per-row
    # nunique that was then summed over the same group, which multiplied the
    # count by the number of rows in the group and pushed song_percent
    # above 1. Counting once on the id keys gives the intended value.
    df['song_count'] = (
        df.groupby(['playlist_id', 'artist_album_id'])['track_name'].transform('nunique')
    )

    encoder = LabelEncoder()
    encoder.fit(df['track_name'])
    df['track_id'] = encoder.transform(df['track_name'])

    # Share of the playlist taken by each artist/album, squashed by a sigmoid.
    df['song_percent'] = df['song_count'] / df['playlist_songs']
    df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))

    artists = df.loc[:, ['playlist_id', 'artist_album_id', 'song_percent']].drop_duplicates()
    # Only the id pair is written out; song_percent is not persisted.
    artists.loc[:, ['playlist_id', 'artist_album_id']].to_csv(
        os.path.join(directory, 'playlists.csv')
    )


if __name__ == '__main__':
    main()