import json
import os
import shutil
import zipfile

import pandas as pd

# Columns kept from each flattened playlist/track row.
cols = [
    'name',
    'pid',
    'num_followers',
    'pos',
    'artist_name',
    'track_name',
    'album_name',
]


def copy_file(src, dst):
    '''
    Copies a file from one dir to another, creating the destination
    directory first when it does not exist yet

    Inputs:
        src: filepath to use as the source
        dst: filepath to copy the file to
    Returns:
    '''
    dst_dir = os.path.dirname(dst)
    # A bare filename has an empty dirname; os.makedirs('') would raise.
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)
    shutil.copy2(src, dst)


def unzip_archive(filepath, dir_path):
    '''
    Unzips a zipfile to the dir_path

    Inputs:
        filepath: filepath of the zip file
        dir_path: path to extract the zip file contents to
    Returns:
    '''
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_path)


def make_dir(directory):
    '''
    Creates a new blank directory, removing any existing directory tree
    at the same path first

    Inputs:
        directory: path to create a new directory at
    Returns:
    '''
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def _playlists_to_frame(playlists):
    '''Flattens a list of playlist records into one row per track.'''
    playlists_df = pd.DataFrame(playlists)
    # One row per (playlist, track); the index is reset so it lines up
    # positionally with the normalized track columns built below.
    exploded = playlists_df.explode('tracks').reset_index(drop=True)
    track_columns = pd.json_normalize(exploded['tracks'])
    flattened = pd.concat(
        [exploded.drop(columns=['tracks']), track_columns], axis=1
    )
    return flattened[cols]


def _write_batch(frames, output_dir, index):
    '''Concatenates the buffered frames and writes them as one parquet file.'''
    batch = pd.concat(frames, axis=0, ignore_index=True)
    batch.to_parquet(os.path.join(output_dir, f'playlists_{index}.parquet'))


def make_dataset(*, source_dir=None, output_dir=None, batch_size=50,
                 max_files=200):
    '''
    Creates the directory of parquet files to create the dataset with,
    used parquet to reduce memory load

    Inputs:
        source_dir: directory holding the playlist .json files, defaults
            to <cwd>/data/raw/playlists/data
        output_dir: directory the parquet files are written to, defaults
            to <cwd>/data/raw/data
        batch_size: number of json files combined into one parquet file
        max_files: maximum number of json files to process, None for all
    Returns:
    '''
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    if source_dir is None:
        source_dir = os.path.join(os.getcwd(), 'data', 'raw', 'playlists', 'data')
    if output_dir is None:
        output_dir = os.path.join(os.getcwd(), 'data', 'raw', 'data')

    # Sorted so that the selection of files and the batch contents are
    # reproducible; os.listdir returns names in arbitrary order.
    json_files = sorted(
        name for name in os.listdir(source_dir)
        if name.endswith('.json')
        and os.path.isfile(os.path.join(source_dir, name))
    )
    if max_files is not None:
        json_files = json_files[:max_files]

    total = len(json_files)
    if total == 0:
        return

    os.makedirs(output_dir, exist_ok=True)

    # Frames are buffered and concatenated once per batch; growing a single
    # DataFrame with concat on every file re-copies all earlier rows.
    frames = []
    for index, filename in enumerate(json_files, start=1):
        print(f'\r{filename}\t{index}/{total}\t{index / total * 100:.1f}%',
              end='')

        full_path = os.path.join(source_dir, filename)
        with open(full_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        frames.append(_playlists_to_frame(json_data['playlists']))

        # Also flush on the last file so a trailing partial batch is kept.
        if index % batch_size == 0 or index == total:
            _write_batch(frames, output_dir, index)
            frames = []


def main():
    '''Extracts the raw archive, resets the output dirs and builds the dataset.'''
    raw_dir = os.path.join(os.getcwd(), 'data', 'raw')
    unzip_archive(
        os.path.join(raw_dir, 'spotify_million_playlist_dataset.zip'),
        os.path.join(raw_dir, 'playlists'),
    )
    make_dir(os.path.join(raw_dir, 'data'))
    make_dir(os.path.join(os.getcwd(), 'data', 'processed'))
    make_dataset()


if __name__ == '__main__':
    main()