import json
import os
import shutil
import zipfile

import pandas as pd
# Column subset kept when flattening playlist JSON into tabular rows:
# playlist-level fields first, then per-track fields.
cols = ['name', 'pid', 'num_followers',
        'pos', 'artist_name', 'track_name', 'album_name']
def copy_file(src, dst):
    '''
    Copies a file from one dir to another, creating the destination
    directory first if it does not exist.
    Inputs:
        src: filepath to use as the source
        dst: filepath to copy the file to
    Returns:
        None
    '''
    dst_dir = os.path.dirname(dst)
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test; the `if` guards against dst having no
    # directory component, where dst_dir == '' and makedirs would raise.
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)
    # copy2 preserves file metadata (mtime etc.) as well as contents.
    shutil.copy2(src, dst)
def unzip_archive(filepath, dir_path):
    '''
    Unzips a zipfile to the dir_path.
    Inputs:
        filepath: filepath of the zip file
        dir_path: path to extract the zip file contents to
    Returns:
        None
    '''
    # Context manager guarantees the archive handle is closed even if
    # extraction raises. (The original wrapped filepath in a no-op f-string.)
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_path)
def make_dir(directory):
    '''
    Creates a new blank directory, deleting any existing directory
    (and its contents) at that path first.
    Inputs:
        directory: path to create a new directory at
    Returns:
        None
    '''
    # Original duplicated os.makedirs() in both branches of an if/else;
    # only the conditional removal needs to branch.
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)
def make_dataset():
    '''
    Creates the directory of parquet files to create the
    dataset with, used parquet to reduce memory load.

    Reads the playlist JSON slices under data/raw/playlists/data,
    flattens each playlist's track list into one row per track
    (keeping the columns in the module-level `cols`), and writes the
    accumulated rows to data/raw/data in chunks of 50 input files.
    Inputs:
        None (paths are derived from the current working directory)
    Returns:
        None
    '''
    directory = os.getcwd() + '/data/raw/playlists/data'
    out_dir = os.getcwd() + '/data/raw/data'
    df = pd.DataFrame()
    index = 0
    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)
        # Skip anything that is not a JSON slice file.
        if not os.path.isfile(full_path) or filename.find('.json') == -1:
            continue
        index += 1
        print(f'\r(unknown)\t{index}/1000\t{((index/1000)*100):.1f}%', end='')
        with open(full_path, 'r') as file:
            json_data = json.load(file)
        # One row per (playlist, track): explode the nested track list,
        # then flatten each track dict into columns.
        temp = pd.DataFrame(json_data['playlists'])
        expanded_df = temp.explode('tracks').reset_index(drop=True)
        json_normalized = pd.json_normalize(expanded_df['tracks'])
        result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
        result = result[cols]
        df = pd.concat([df, result], axis=0, ignore_index=True)
        if index % 50 == 0:
            # BUG FIX: original named the file playlists_{index % 1000},
            # which collides (overwrites) once index exceeds 1000; use
            # index directly (identical value for the indices reached here).
            df.to_parquet(f'{out_dir}/playlists_{index}.parquet')
            df = pd.DataFrame()
        if index % 200 == 0:
            # NOTE(review): deliberate cutoff after 200 input files,
            # presumably to sample the dataset - TODO confirm.
            break
    # BUG FIX: flush rows accumulated since the last multiple of 50;
    # the original silently dropped them when the loop ended early.
    if not df.empty:
        df.to_parquet(f'{out_dir}/playlists_{index}.parquet')
if __name__ == '__main__':
    # Extract the raw archive, reset the two output directories,
    # then build the chunked parquet dataset.
    cwd = os.getcwd()
    unzip_archive(cwd + '/data/raw/spotify_million_playlist_dataset.zip',
                  cwd + '/data/raw/playlists')
    for subdir in ('/data/raw/data', '/data/processed'):
        make_dir(cwd + subdir)
    make_dataset()