import os
import shutil

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
def make_dir(directory):
    '''
    Creates a new blank directory, removing any existing directory at the path
    Inputs:
        directory: path to create a new directory at
    Returns:
        None
    '''
    # Start from a clean slate: delete the directory if it already exists
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)
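# Note (design sketch, not part of the original logic): os.makedirs(directory, exist_ok=True)
# would also tolerate an existing directory, but it would keep its old contents,
# which is why the rmtree-then-makedirs approach above is used instead.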
def read_parquet_folder(folder_path):
    '''
    Creates a pandas dataframe from a folder of parquet files
    Inputs:
        folder_path: the folder path for the parquet files
    Returns:
        A single dataframe with all parquet files concatenated row-wise
    '''
    # Read every .parquet file in the folder and stack them into one dataframe
    dataframes = []
    for file in os.listdir(folder_path):
        if file.endswith('.parquet'):
            file_path = os.path.join(folder_path, file)
            df = pd.read_parquet(file_path)
            dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)
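# Note (hedged alternative): with the pyarrow engine installed, pd.read_parquet(folder_path)
# can typically read a whole directory of parquet files in one call; the explicit loop above
# keeps the file filtering visible and avoids relying on that engine-specific behaviour.
#   df = pd.read_parquet(folder_path)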
def create_ids(df, col, name):
    '''
    Creates unique ids for a feature column and writes the id-to-value mapping to csv
    Inputs:
        df: dataframe with the features
        col: column to create ids on
        name: name of the newly created id
    Returns:
        df: dataframe with the mapped ids
    '''
    # Assign a sequential integer id to each unique value, in order of first appearance
    value_to_id = {val: i for i, val in enumerate(df[col].unique())}
    df[f'{name}_id'] = df[col].map(value_to_id)
    # Persist the id-to-value mapping so ids can be translated back later
    df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')
    return df
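# A roughly equivalent one-liner (sketch) for the id assignment above, using pandas'
# built-in factorizer, which also numbers values in order of first appearance:
#   df[f'{name}_id'] = pd.factorize(df[col])[0]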
if __name__ == '__main__':
    # Load the raw playlist data and rebuild the processed-data directory
    folder_path = os.getcwd() + '/data/raw/data'
    df = read_parquet_folder(folder_path)
    directory = os.getcwd() + '/data/processed'
    make_dir(directory)

    # Create integer ids (and mapping csvs) for artists, playlists, and albums
    df = create_ids(df, 'artist_name', 'artist')
    df = create_ids(df, 'pid', 'playlist')
    df = create_ids(df, 'album_name', 'album')

    # Unique tracks per (playlist, artist, album) and total tracks per playlist
    df['song_count'] = df.groupby(['pid', 'artist_name', 'album_name'])['track_name'].transform('nunique')
    df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max') + 1

    # Build a combined artist::album key, id it, and write its mapping csv
    df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
    value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
    df['artist_album_id'] = df['artist_album'].map(value_to_id)
    df[['artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + '/data/processed/artist_album.csv')

    # Sum the track counts within each (playlist, artist_album) group
    df['song_count'] = df.groupby(['playlist_id', 'artist_album_id'])['song_count'].transform('sum')

    # Label-encode track names into integer track ids
    encoder = LabelEncoder()
    encoder.fit(df['track_name'])
    df['track_id'] = encoder.transform(df['track_name'])

    # Fraction of the playlist attributed to each artist/album pair, passed through a sigmoid
    df['song_percent'] = df['song_count'] / df['playlist_songs']
    df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))

    # Keep one row per (playlist, artist_album) pair and write the final playlist table
    artists = df.loc[:, ['playlist_id', 'artist_album_id', 'song_percent']].drop_duplicates()
    artists.loc[:, ['playlist_id', 'artist_album_id']].to_csv(os.getcwd() + '/data/processed/playlists.csv')
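# Usage sketch for the generated artifacts (paths as written above; the csvs are written
# with the dataframe index as their first column, hence index_col=0 when reading back):
#   artist_map = pd.read_csv(os.getcwd() + '/data/processed/artist.csv', index_col=0)
#   playlists = pd.read_csv(os.getcwd() + '/data/processed/playlists.csv', index_col=0)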