recommendation_module_project / scripts /build_features.py

Added Naive model and comments

6ce6b56 verified over 1 year ago

3.1 kB

	import numpy as np
	import os
	import pandas as pd
	import numpy as np
	import pandas as pd
	from sklearn.preprocessing import LabelEncoder
	import shutil
	import os

	def make_dir(directory):
	    '''
	    Creates a new blank directory, discarding whatever already exists at the path

	    Inputs:
	        directory: path to create a new directory at
	    Returns:
	        None
	    '''
	    if os.path.isdir(directory) and not os.path.islink(directory):
	        # A real directory: drop it together with all of its contents.
	        shutil.rmtree(directory)
	    elif os.path.lexists(directory):
	        # A regular file or a symlink: shutil.rmtree refuses both, so unlink it.
	        os.remove(directory)

	    os.makedirs(directory)


	def read_parquet_folder(folder_path):
	    '''
	    Creates the pandas dataframe from a folder of parquet files

	    Files are read in sorted file-name order so the row order of the result
	    is the same on every run (os.listdir returns entries in arbitrary order).

	    Inputs:
	        folder_path: the folder path for the parquet files
	    Returns:
	        one dataframe holding the rows of every '.parquet' file in the
	        folder, concatenated and re-indexed from 0
	    Raises:
	        ValueError: if the folder contains no '.parquet' files
	    '''
	    parquet_files = sorted(
	        file for file in os.listdir(folder_path) if file.endswith('.parquet')
	    )
	    if not parquet_files:
	        # pd.concat([]) would raise an opaque "No objects to concatenate".
	        raise ValueError(f"No '.parquet' files found in {folder_path}")

	    dataframes = [
	        pd.read_parquet(os.path.join(folder_path, file))
	        for file in parquet_files
	    ]
	    return pd.concat(dataframes, ignore_index=True)


	def create_ids(df, col, name, output_dir=None):
	    '''
	    Creates unique ids for the features and creates mapping documents

	    Ids are integers assigned in order of first appearance of each distinct
	    value in the column. The id column is added to the passed dataframe in
	    place, and the (id, value) pairs are written to '<name>.csv'.

	    Inputs:
	        df: dataframe with the features
	        col: column to create ids on
	        name: name of the newly created id (column is '<name>_id')
	        output_dir: folder to write the mapping csv to; defaults to
	            'data/processed' under the current working directory
	    Returns:
	        df: dataframe with the mapped ids
	    '''
	    if output_dir is None:
	        output_dir = os.path.join(os.getcwd(), 'data', 'processed')
	    # Make sure the target folder exists so the csv write cannot fail on it.
	    os.makedirs(output_dir, exist_ok=True)

	    id_col = f'{name}_id'
	    value_to_id = {val: i for i, val in enumerate(df[col].unique())}
	    df[id_col] = df[col].map(value_to_id)

	    # One row per distinct (id, value) pair; the dataframe index is kept in
	    # the csv as its first, unnamed column.
	    mapping = df[[id_col, col]].drop_duplicates()
	    mapping.to_csv(os.path.join(output_dir, f'{name}.csv'))

	    return df

	def main():
	    '''
	    Builds id mappings and per-playlist features from the raw parquet data

	    Reads every parquet file under 'data/raw/data', recreates
	    'data/processed' (both relative to the current working directory) and
	    writes the artist, playlist, album, artist_album and playlists csv files
	    into it.

	    Returns:
	        None
	    '''
	    base_dir = os.getcwd()
	    df = read_parquet_folder(os.path.join(base_dir, 'data', 'raw', 'data'))

	    # Start from an empty output folder so files from earlier runs are gone.
	    processed_dir = os.path.join(base_dir, 'data', 'processed')
	    make_dir(processed_dir)

	    df = create_ids(df, 'artist_name', 'artist')
	    df = create_ids(df, 'pid', 'playlist')
	    df = create_ids(df, 'album_name', 'album')

	    # Number of distinct track names per (playlist, artist, album).
	    album_groups = df.groupby(['pid', 'artist_name', 'album_name'])
	    df['song_count'] = album_groups['track_name'].transform('nunique')

	    # Largest position in the playlist plus one.
	    # NOTE(review): presumably 'pos' is 0-based, making this the playlist
	    # length - confirm against the raw data.
	    df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max') + 1

	    # Vectorized 'artist::album' key; same strings as joining row by row but
	    # without a Python-level call per row.
	    df['artist_album'] = df['artist_name'].str.cat(df['album_name'], sep='::')
	    value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
	    df['artist_album_id'] = df['artist_album'].map(value_to_id)

	    artist_album_cols = [
	        'artist_album_id',
	        'artist_album',
	        'artist_name',
	        'album_name',
	        'track_name',
	    ]
	    df[artist_album_cols].drop_duplicates().to_csv(
	        os.path.join(processed_dir, 'artist_album.csv')
	    )

	    # NOTE(review): every row already carries its group's distinct-track
	    # count, so summing over what looks like the same grouping gives
	    # (rows in group) * (count), not the count itself - confirm intended.
	    df['song_count'] = df.groupby(['playlist_id', 'artist_album_id'])[
	        'song_count'
	    ].transform('sum')

	    encoder = LabelEncoder()
	    encoder.fit(df['track_name'])

	    # NOTE(review): track_id is not written by any csv below - confirm it
	    # is still needed.
	    df['track_id'] = encoder.transform(df['track_name'])

	    # Share of the playlist, squashed through a logistic function.
	    df['song_percent'] = df['song_count'] / df['playlist_songs']
	    df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))

	    # song_percent takes part in the de-duplication only; the written file
	    # holds just the (playlist_id, artist_album_id) pairs.
	    artists = df.loc[
	        :, ['playlist_id', 'artist_album_id', 'song_percent']
	    ].drop_duplicates()
	    artists.loc[:, ['playlist_id', 'artist_album_id']].to_csv(
	        os.path.join(processed_dir, 'playlists.csv')
	    )


	if __name__ == '__main__':
	    main()