import json
import os
import shutil
import zipfile

import pandas as pd


# Columns to keep from the flattened playlist/track records
cols = [
    'name',
    'pid',
    'num_followers',
    'pos',
    'artist_name',
    'track_name',
    'album_name'
]


def copy_file(src, dst):
    '''
    Copies a file from one directory to another, creating the
    destination directory if it does not exist.

    Inputs:
        src: filepath to use as the source
        dst: filepath to copy the file to

    Returns:
        None
    '''
    dst_dir = os.path.dirname(dst)
    os.makedirs(dst_dir, exist_ok=True)
    shutil.copy2(src, dst)

def unzip_archive(filepath, dir_path):
    '''
    Unzips a zip archive to dir_path.

    Inputs:
        filepath: filepath of the zip file
        dir_path: path to extract the zip file contents to

    Returns:
        None
    '''
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_path)


def make_dir(directory):
    '''
    Creates a new blank directory, deleting any existing directory
    at the same path first.

    Inputs:
        directory: path to create a new directory at

    Returns:
        None
    '''
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def make_dataset():
    '''
    Builds the dataset from the extracted playlist JSON slices and
    writes it out as a directory of parquet files; chunking into
    parquet keeps the in-memory frame small.

    Inputs:
        None

    Returns:
        None
    '''
    directory = os.getcwd() + '/data/raw/playlists/data'
    df = pd.DataFrame()
    index = 0

    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)
        if os.path.isfile(full_path) and filename.endswith('.json'):
            index += 1

            # The dataset ships as 1,000 JSON slices; show progress in place
            print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')

            with open(full_path, 'r') as file:
                json_data = json.load(file)

            # One row per playlist, then one row per track: explode the
            # 'tracks' lists and flatten the nested track dicts into columns
            # (see the toy example after this function)
            temp = pd.DataFrame(json_data['playlists'])
            expanded_df = temp.explode('tracks').reset_index(drop=True)
            json_normalized = pd.json_normalize(expanded_df['tracks'])

            result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
            result = result[cols]

            df = pd.concat([df, result], axis=0, ignore_index=True)

            # Flush to parquet every 50 files so the frame never grows unbounded
            if index % 50 == 0:
                df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index}.parquet')
                del df
                df = pd.DataFrame()

    # Write out any remainder left over after the last full chunk
    if not df.empty:
        df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index}.parquet')

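# A toy illustration of the explode + json_normalize flattening used in
# make_dataset(), with made-up data rather than the real dataset:
#
#     demo = pd.DataFrame({'pid': [0],
#                          'tracks': [[{'pos': 0, 'track_name': 'a'},
#                                      {'pos': 1, 'track_name': 'b'}]]})
#     demo = demo.explode('tracks').reset_index(drop=True)
#     flat = pd.concat([demo.drop(columns=['tracks']),
#                       pd.json_normalize(demo['tracks'])], axis=1)
#     # flat now has one row per (pid, pos, track_name) combination
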
if __name__ == '__main__':
    # Extract the archive, reset the output directories, then build the dataset
    unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip',
                  os.getcwd() + '/data/raw/playlists')
    make_dir(os.getcwd() + '/data/raw/data')
    make_dir(os.getcwd() + '/data/processed')
    make_dataset()
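

# A minimal sketch of reading the chunked output back into one frame, e.g.
# for the processing step that fills data/processed. It assumes the parquet
# files written by make_dataset() sit in data/raw/data and that a parquet
# engine (pyarrow or fastparquet) is installed; load_playlists is a
# hypothetical helper, not part of the pipeline above:
#
#     import glob
#
#     def load_playlists():
#         parts = sorted(glob.glob(os.getcwd() + '/data/raw/data/playlists_*.parquet'))
#         return pd.concat((pd.read_parquet(p) for p in parts), ignore_index=True)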