# Author: keesephillips
# Commit message: Added Naive model and comments
# Commit: 6ce6b56 (verified)
import os
import zipfile
import json
import pandas as pd
import pandas as pd
import shutil
import os
# Columns kept (in this order) from each flattened playlist/track table
# before it is written out.
cols = [
    "name", "pid", "num_followers",
    "pos", "artist_name", "track_name", "album_name",
]
def copy_file(src, dst):
    '''
    Copies a file from one dir to another, creating the destination
    directory (and any missing parents) first when needed

    Inputs:
        src: filepath to use as the source
        dst: filepath to copy the file to

    Returns:
        None
    '''
    dst_dir = os.path.dirname(dst)

    # A bare filename such as 'out.txt' has an empty dirname, and
    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when there is one to create. exist_ok avoids the check-then-create race.
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)

    # copy2 copies the file contents and attempts to preserve its metadata.
    shutil.copy2(src, dst)
def unzip_archive(filepath, dir_path):
    '''
    Extracts every member of a zip archive into a target directory

    Inputs:
        filepath: location of the zip archive to read
        dir_path: directory that receives the extracted contents

    Returns:
        None
    '''
    archive_path = str(filepath)
    with zipfile.ZipFile(archive_path, mode='r') as archive:
        archive.extractall(path=dir_path)
def make_dir(directory):
    '''
    Ensures an empty directory exists at the given path; anything
    already stored there is deleted first

    Inputs:
        directory: path at which the fresh directory is created

    Returns:
        None
    '''
    # Throw away a previous tree so the directory always starts out empty.
    if os.path.exists(directory):
        shutil.rmtree(directory)

    os.makedirs(directory)
def _playlist_file_to_frame(full_path):
    '''
    Loads one playlist JSON file and flattens it to one row per track

    Inputs:
        full_path: path of the JSON file to read

    Returns:
        DataFrame restricted to the columns listed in `cols`
    '''
    # Close the file as soon as it is parsed; the pandas work below does not
    # need the handle. JSON is read as UTF-8 rather than the platform default.
    with open(full_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    playlists = pd.DataFrame(json_data['playlists'])

    # explode() gives every entry of 'tracks' its own row; json_normalize()
    # then spreads each of those entries into separate columns.
    exploded = playlists.explode('tracks').reset_index(drop=True)
    tracks = pd.json_normalize(exploded['tracks'])
    flattened = pd.concat([exploded.drop(columns=['tracks']), tracks], axis=1)

    return flattened[cols]


def make_dataset(files_per_chunk=50, max_files=200):
    '''
    Creates the directory of parquet files to create the
    dataset with, used parquet to reduce memory load

    Reads the .json files found in <cwd>/data/raw/playlists/data, flattens
    each one to one row per track and writes the rows to
    <cwd>/data/raw/data/playlists_<n>.parquet, where <n> is the number of
    files processed when the chunk was written.

    Inputs:
        files_per_chunk: number of JSON files combined into one parquet file
        max_files: stop after this many JSON files; None processes them all

    Returns:
        None
    '''
    if files_per_chunk < 1:
        raise ValueError('files_per_chunk must be at least 1')

    directory = os.path.join(os.getcwd(), 'data', 'raw', 'playlists', 'data')
    output_dir = os.path.join(os.getcwd(), 'data', 'raw', 'data')

    def write_chunk(frames, index):
        # One concat per chunk instead of one per file: growing a DataFrame
        # inside the loop re-copies every accumulated row each time.
        chunk = pd.concat(frames, axis=0, ignore_index=True)
        chunk.to_parquet(os.path.join(output_dir, f'playlists_{index}.parquet'))

    frames = []
    index = 0

    # os.listdir() returns names in arbitrary order; sorting makes the set of
    # files picked up before the max_files cut-off the same on every run.
    for filename in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, filename)
        if not (filename.endswith('.json') and os.path.isfile(full_path)):
            continue

        index += 1

        # NOTE(review): the total of 1000 is hard-coded -- presumably the
        # number of slice files in the full dataset; confirm.
        print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')

        frames.append(_playlist_file_to_frame(full_path))

        if index % files_per_chunk == 0:
            write_chunk(frames, index)
            frames = []

        if max_files is not None and index >= max_files:
            break

    # Write whatever is left when the loop ends off a chunk boundary, so
    # those rows are not silently dropped.
    if frames:
        write_chunk(frames, index)
if __name__ == '__main__':
    cwd = os.getcwd()

    # Unpack the downloaded archive under the raw data directory.
    unzip_archive(cwd + '/data/raw/spotify_million_playlist_dataset.zip', cwd + '/data/raw/playlists')

    # Both output directories are recreated empty on every run.
    for directory in (cwd + '/data/raw/data', cwd + '/data/processed'):
        make_dir(directory)

    make_dataset()