import os
import shutil

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def make_dir(directory: str) -> None:
    '''
    Creates a new blank directory, deleting any existing one first

    Inputs:
        directory: path to create a new directory at

    Returns:
        None
    '''
    # Wipe whatever is already there so the directory always starts empty.
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def read_parquet_folder(folder_path: str) -> pd.DataFrame:
    '''
    Creates the pandas dataframe from a folder of parquet files

    Inputs:
        folder_path: the folder path for the parquet files

    Returns:
        a single dataframe with the rows of every '.parquet' file in the
        folder, concatenated with a fresh 0..n-1 index

    Raises:
        ValueError: if the folder contains no '.parquet' files
    '''
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any ids assigned by first appearance) reproducible.
    parquet_files = sorted(
        entry for entry in os.listdir(folder_path) if entry.endswith('.parquet')
    )
    if not parquet_files:
        # pd.concat([]) would raise a ValueError anyway, but without saying
        # which folder was empty.
        raise ValueError(f'No .parquet files found in {folder_path!r}')

    dataframes = [
        pd.read_parquet(os.path.join(folder_path, entry))
        for entry in parquet_files
    ]
    return pd.concat(dataframes, ignore_index=True)


def create_ids(df: pd.DataFrame, col: str, name: str) -> pd.DataFrame:
    '''
    Creates unique ids for the features and creates mapping documents

    Each distinct value of `col` gets an integer id in order of first
    appearance. The id column is added to `df` in place, and the id/value
    mapping is written to '<cwd>/data/processed/{name}.csv'.

    Inputs:
        df: dataframe with the features (modified in place)
        col: column to create ids on
        name: name of the newly created id; the new column is '{name}_id'

    Returns:
        df: dataframe with the mapped ids
    '''
    value_to_id = {val: i for i, val in enumerate(df[col].unique())}
    df[f'{name}_id'] = df[col].map(value_to_id)
    # One row per id/value pair; the output directory is expected to exist.
    df[[f'{name}_id', col]].drop_duplicates().to_csv(
        os.getcwd() + f'/data/processed/{name}.csv'
    )
    return df


def main() -> None:
    '''
    Builds the processed id-mapping and playlist CSVs from the raw parquet data

    Reads '<cwd>/data/raw/data', recreates '<cwd>/data/processed' from
    scratch and writes artist.csv, playlist.csv, album.csv,
    artist_album.csv and playlists.csv into it.
    '''
    cwd = os.getcwd()

    df = read_parquet_folder(cwd + '/data/raw/data')

    # The raw data is read before the output directory is wiped and recreated.
    make_dir(cwd + '/data/processed')

    df = create_ids(df, 'artist_name', 'artist')
    df = create_ids(df, 'pid', 'playlist')
    df = create_ids(df, 'album_name', 'album')

    # Distinct track names per (playlist, artist, album), broadcast to rows.
    df['song_count'] = df.groupby(
        ['pid', 'artist_name', 'album_name']
    )['track_name'].transform('nunique')

    # Largest position in the playlist plus one.
    # NOTE(review): treats `pos` as zero-based to get a song count - confirm.
    df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')
    df['playlist_songs'] += 1

    # Combined artist/album key, with ids assigned by first appearance.
    df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
    value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
    df['artist_album_id'] = df['artist_album'].map(value_to_id)
    df[[
        'artist_album_id',
        'artist_album',
        'artist_name',
        'album_name',
        'track_name',
    ]].drop_duplicates().to_csv(cwd + '/data/processed/artist_album.csv')

    # NOTE(review): song_count is already broadcast to every row, so this sum
    # yields (rows in group) x (unique tracks in group) rather than the unique
    # track count itself - confirm that this is the intended weighting.
    df['song_count'] = df.groupby(
        ['playlist_id', 'artist_album_id']
    )['song_count'].transform('sum')

    # Integer label per track name; not written to any of the CSVs below.
    encoder = LabelEncoder()
    encoder.fit(df['track_name'])
    df['track_id'] = encoder.transform(df['track_name'])

    # Share of the playlist, squashed through a logistic sigmoid.
    df['song_percent'] = df['song_count'] / df['playlist_songs']
    df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))

    # song_percent takes part in the de-duplication but is dropped on output.
    artists = df.loc[
        :, ['playlist_id', 'artist_album_id', 'song_percent']
    ].drop_duplicates()
    artists.loc[:, ['playlist_id', 'artist_album_id']].to_csv(
        cwd + '/data/processed/playlists.csv'
    )


if __name__ == '__main__':
    main()