import json
import os
import shutil
import zipfile

import pandas as pd
# Column subset kept when flattening playlist JSON into tabular rows:
# playlist-level fields first, then per-track fields.
cols = ['name', 'pid', 'num_followers',
        'pos', 'artist_name', 'track_name', 'album_name']
def copy_file(src, dst):
    '''
    Copies a file from one dir to another, creating the destination
    directory first if it does not exist.
    Inputs:
        src: filepath to use as the source
        dst: filepath to copy the file to
    Returns:
        None
    '''
    dst_dir = os.path.dirname(dst)
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test; the `if` guards against dst having no
    # directory component, where dst_dir == '' and makedirs would raise.
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)
    # copy2 preserves file metadata (mtime etc.) as well as contents.
    shutil.copy2(src, dst)
def unzip_archive(filepath, dir_path):
    '''
    Unzips a zipfile to the dir_path.
    Inputs:
        filepath: filepath of the zip file
        dir_path: path to extract the zip file contents to
    Returns:
        None
    '''
    # Context manager guarantees the archive handle is closed even if
    # extraction raises. (The original wrapped filepath in a no-op f-string.)
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_path)
def make_dir(directory):
    '''
    Creates a new blank directory, deleting any existing directory
    (and its contents) at that path first.
    Inputs:
        directory: path to create a new directory at
    Returns:
        None
    '''
    # Original duplicated os.makedirs() in both branches of an if/else;
    # only the conditional removal needs to branch.
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)
def make_dataset():
    '''
    Creates the directory of parquet files to create the
    dataset with, used parquet to reduce memory load.

    Reads the playlist JSON slices under data/raw/playlists/data,
    flattens each playlist's track list into one row per track
    (keeping the columns in the module-level `cols`), and writes the
    accumulated rows to data/raw/data in chunks of 50 input files.
    Inputs:
        None (paths are derived from the current working directory)
    Returns:
        None
    '''
    directory = os.getcwd() + '/data/raw/playlists/data'
    out_dir = os.getcwd() + '/data/raw/data'
    df = pd.DataFrame()
    index = 0
    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)
        # Skip anything that is not a JSON slice file.
        if not os.path.isfile(full_path) or filename.find('.json') == -1:
            continue
        index += 1
        print(f'\r(unknown)\t{index}/1000\t{((index/1000)*100):.1f}%', end='')
        with open(full_path, 'r') as file:
            json_data = json.load(file)
        # One row per (playlist, track): explode the nested track list,
        # then flatten each track dict into columns.
        temp = pd.DataFrame(json_data['playlists'])
        expanded_df = temp.explode('tracks').reset_index(drop=True)
        json_normalized = pd.json_normalize(expanded_df['tracks'])
        result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
        result = result[cols]
        df = pd.concat([df, result], axis=0, ignore_index=True)
        if index % 50 == 0:
            # BUG FIX: original named the file playlists_{index % 1000},
            # which collides (overwrites) once index exceeds 1000; use
            # index directly (identical value for the indices reached here).
            df.to_parquet(f'{out_dir}/playlists_{index}.parquet')
            df = pd.DataFrame()
        if index % 200 == 0:
            # NOTE(review): deliberate cutoff after 200 input files,
            # presumably to sample the dataset - TODO confirm.
            break
    # BUG FIX: flush rows accumulated since the last multiple of 50;
    # the original silently dropped them when the loop ended early.
    if not df.empty:
        df.to_parquet(f'{out_dir}/playlists_{index}.parquet')
if __name__ == '__main__':
    # Extract the raw archive, reset the two output directories,
    # then build the chunked parquet dataset.
    cwd = os.getcwd()
    unzip_archive(cwd + '/data/raw/spotify_million_playlist_dataset.zip',
                  cwd + '/data/raw/playlists')
    for subdir in ('/data/raw/data', '/data/processed'):
        make_dir(cwd + subdir)
    make_dataset()