import json
import os
import shutil
import zipfile

import pandas as pd


# Columns to keep from the flattened playlist/track records
cols = [
    'name',
    'pid',
    'num_followers',
    'pos',
    'artist_name',
    'track_name',
    'album_name'
]


def copy_file(src, dst):
    '''
    Copies a file from one directory to another, creating the
    destination directory if it does not exist.

    Inputs:
        src: filepath to use as the source
        dst: filepath to copy the file to

    Returns:
        None
    '''
    dst_dir = os.path.dirname(dst)
    os.makedirs(dst_dir, exist_ok=True)
    shutil.copy2(src, dst)

def unzip_archive(filepath, dir_path):
    '''
    Unzips a zip archive to dir_path.

    Inputs:
        filepath: filepath of the zip file
        dir_path: path to extract the zip file contents to

    Returns:
        None
    '''
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_path)


def make_dir(directory):
    '''
    Creates a new blank directory, deleting any existing directory
    at the same path first.

    Inputs:
        directory: path to create a new directory at

    Returns:
        None
    '''
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def make_dataset():
    '''
    Builds the dataset from the extracted playlist JSON slices and
    writes it out as a directory of parquet files; chunking into
    parquet keeps the in-memory frame small.

    Inputs:
        None

    Returns:
        None
    '''
    directory = os.getcwd() + '/data/raw/playlists/data'
    df = pd.DataFrame()
    index = 0

    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)
        if os.path.isfile(full_path) and filename.endswith('.json'):
            index += 1

            # The dataset ships as 1,000 JSON slices; show progress in place
            print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')

            with open(full_path, 'r') as file:
                json_data = json.load(file)

            # One row per playlist, then one row per track: explode the
            # 'tracks' lists and flatten the nested track dicts into columns
            # (see the toy example after this function)
            temp = pd.DataFrame(json_data['playlists'])
            expanded_df = temp.explode('tracks').reset_index(drop=True)
            json_normalized = pd.json_normalize(expanded_df['tracks'])

            result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
            result = result[cols]

            df = pd.concat([df, result], axis=0, ignore_index=True)

            # Flush to parquet every 50 files so the frame never grows unbounded
            if index % 50 == 0:
                df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index}.parquet')
                del df
                df = pd.DataFrame()

    # Write out any remainder left over after the last full chunk
    if not df.empty:
        df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index}.parquet')

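# A toy illustration of the explode + json_normalize flattening used in
# make_dataset(), with made-up data rather than the real dataset:
#
#     demo = pd.DataFrame({'pid': [0],
#                          'tracks': [[{'pos': 0, 'track_name': 'a'},
#                                      {'pos': 1, 'track_name': 'b'}]]})
#     demo = demo.explode('tracks').reset_index(drop=True)
#     flat = pd.concat([demo.drop(columns=['tracks']),
#                       pd.json_normalize(demo['tracks'])], axis=1)
#     # flat now has one row per (pid, pos, track_name) combination
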
if __name__ == '__main__':
    # Extract the archive, reset the output directories, then build the dataset
    unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip',
                  os.getcwd() + '/data/raw/playlists')
    make_dir(os.getcwd() + '/data/raw/data')
    make_dir(os.getcwd() + '/data/processed')
    make_dataset()
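

# A minimal sketch of reading the chunked output back into one frame, e.g.
# for the processing step that fills data/processed. It assumes the parquet
# files written by make_dataset() sit in data/raw/data and that a parquet
# engine (pyarrow or fastparquet) is installed; load_playlists is a
# hypothetical helper, not part of the pipeline above:
#
#     import glob
#
#     def load_playlists():
#         parts = sorted(glob.glob(os.getcwd() + '/data/raw/data/playlists_*.parquet'))
#         return pd.concat((pd.read_parquet(p) for p in parts), ignore_index=True)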