{ "*": { "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm.sparse_submanifold_conv_bwd_input_implicit_gemm_kernel": { "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm.sparse_submanifold_conv_bwd_weight_implicit_gemm_kernel": { "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_implicit_gemm_kernel": { "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_implicit_gemm_kernel": { "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_implicit_gemm_splitk_kernel": { "(7, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 4, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_implicit_gemm_splitk_kernel": { "(7, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_implicit_gemm_splitk": { "(2^7, 1024, 1024, 27)": { "SPLITK": 8 }, "(2^9, 1024, 1024, 27)": { "SPLITK": 8 }, "(2^11, 1024, 1024, 27)": { "SPLITK": 4 }, "(2^13, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^15, 512, 512, 27)": { "SPLITK": 1 }, "(2^17, 256, 256, 27)": { "SPLITK": 1 } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_implicit_gemm_splitk": { "(2^7, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^9, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^11, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^13, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^15, 512, 512, 27)": { "SPLITK": 1 }, "(2^17, 256, 256, 27)": { "SPLITK": 8 } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm.sparse_submanifold_conv_bwd_input_masked_implicit_gemm_kernel": { "(18, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm.sparse_submanifold_conv_bwd_weight_masked_implicit_gemm_kernel": { "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_masked_implicit_gemm_kernel": { "(18, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_masked_implicit_gemm_kernel": { "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_masked_implicit_gemm_splitk_kernel": { "(16, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 2048, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 4, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_masked_implicit_gemm_splitk_kernel": { "(18, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 128, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_masked_implicit_gemm_splitk": { "(2^18, 64, 64, 27)": { "SPLITK": 1 }, "(2^16, 128, 512, 27)": { "SPLITK": 1 }, "(2^16, 128, 128, 27)": { "SPLITK": 1 }, "(2^14, 256, 1024, 27)": { "SPLITK": 2 }, "(2^14, 256, 256, 27)": { "SPLITK": 1 }, "(2^12, 512, 2048, 27)": { "SPLITK": 4 }, "(2^12, 512, 512, 27)": { "SPLITK": 4 }, "(2^10, 1024, 4096, 27)": { "SPLITK": 8 }, "(2^10, 1024, 1024, 27)": { "SPLITK": 8 }, "(2^12, 512, 128, 27)": { "SPLITK": 1 }, "(2^14, 256, 64, 27)": { "SPLITK": 1 }, "(2^16, 128, 32, 27)": { "SPLITK": 1 }, "(2^18, 64, 16, 27)": { "SPLITK": 1 }, "(2^11, 512, 2048, 27)": { "SPLITK": 8 }, "(2^11, 512, 512, 27)": { "SPLITK": 4 }, "(2^9, 1024, 4096, 27)": { "SPLITK": 16 }, "(2^9, 1024, 1024, 27)": { "SPLITK": 8 }, "(2^11, 512, 128, 27)": { "SPLITK": 1 }, "(2^13, 256, 1024, 27)": { "SPLITK": 4 }, "(2^13, 256, 256, 27)": { "SPLITK": 1 }, "(2^13, 256, 64, 27)": { "SPLITK": 1 }, "(2^17, 64, 64, 27)": { "SPLITK": 1 }, "(2^15, 128, 512, 27)": { "SPLITK": 1 }, "(2^15, 128, 128, 27)": { "SPLITK": 1 }, "(2^15, 128, 32, 27)": { "SPLITK": 1 }, "(2^17, 64, 16, 27)": { "SPLITK": 1 }, "(2^8, 1024, 4096, 27)": { "SPLITK": 16 }, "(2^8, 1024, 1024, 27)": { "SPLITK": 16 }, "(2^19, 64, 64, 27)": { "SPLITK": 1 }, "(2^17, 128, 512, 27)": { "SPLITK": 1 }, "(2^17, 128, 128, 27)": { "SPLITK": 1 }, "(2^15, 256, 1024, 27)": { "SPLITK": 2 }, "(2^15, 256, 256, 27)": { "SPLITK": 1 }, "(2^13, 512, 2048, 27)": { "SPLITK": 4 }, "(2^13, 512, 512, 27)": { "SPLITK": 1 }, "(2^13, 512, 128, 27)": { "SPLITK": 1 }, "(2^15, 256, 64, 27)": { "SPLITK": 1 }, "(2^17, 128, 32, 27)": { "SPLITK": 1 }, "(2^19, 64, 16, 27)": { "SPLITK": 1 }, "(2^11, 1024, 4096, 27)": { "SPLITK": 16 }, "(2^11, 1024, 1024, 27)": { "SPLITK": 4 }, "(2^10, 512, 2048, 27)": { "SPLITK": 16 }, "(2^10, 512, 512, 27)": { "SPLITK": 4 }, "(2^10, 512, 128, 27)": { "SPLITK": 1 }, "(2^20, 64, 64, 27)": { "SPLITK": 1 }, "(2^20, 64, 16, 27)": { "SPLITK": 1 }, "(2^18, 128, 512, 27)": { "SPLITK": 1 }, "(2^18, 128, 128, 27)": { "SPLITK": 1 }, "(2^18, 128, 32, 27)": { "SPLITK": 1 }, "(2^16, 256, 1024, 27)": { "SPLITK": 1 }, "(2^16, 256, 256, 27)": { "SPLITK": 1 }, "(2^16, 256, 64, 27)": { "SPLITK": 1 }, "(2^12, 256, 1024, 27)": { "SPLITK": 8 }, "(2^12, 256, 256, 27)": { "SPLITK": 1 }, "(2^12, 256, 64, 27)": { "SPLITK": 1 }, "(2^21, 64, 64, 27)": { "SPLITK": 1 }, "(2^21, 64, 16, 27)": { "SPLITK": 1 }, "(2^19, 128, 512, 27)": { "SPLITK": 1 }, "(2^19, 128, 128, 27)": { "SPLITK": 1 }, "(2^17, 256, 1024, 27)": { "SPLITK": 1 }, "(2^17, 256, 256, 27)": { "SPLITK": 1 }, "(2^14, 512, 2048, 27)": { "SPLITK": 4 }, "(2^14, 512, 512, 27)": { "SPLITK": 1 }, "(2^14, 512, 128, 27)": { "SPLITK": 1 }, "(2^17, 256, 64, 27)": { "SPLITK": 1 }, "(2^19, 128, 32, 27)": { "SPLITK": 1 }, "(2^7, 1024, 4096, 27)": { "SPLITK": 32 }, "(2^7, 1024, 1024, 27)": { "SPLITK": 8 }, "(2^14, 128, 512, 27)": { "SPLITK": 4 }, "(2^14, 128, 128, 27)": { "SPLITK": 1 }, "(2^14, 128, 32, 27)": { "SPLITK": 1 }, "(2^16, 64, 64, 27)": { "SPLITK": 1 }, "(2^16, 64, 16, 27)": { "SPLITK": 1 }, "(2^22, 64, 64, 27)": { "SPLITK": 1 }, "(2^22, 64, 16, 27)": { "SPLITK": 1 }, "(2^13, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^15, 512, 512, 27)": { "SPLITK": 1 } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_masked_implicit_gemm_splitk": { "(2^18, 64, 64, 27)": { "SPLITK": 128 }, "(2^16, 128, 512, 27)": { "SPLITK": 16 }, "(2^16, 128, 128, 27)": { "SPLITK": 32 }, "(2^14, 256, 1024, 27)": { "SPLITK": 1 }, "(2^14, 256, 256, 27)": { "SPLITK": 4 }, "(2^12, 512, 2048, 27)": { "SPLITK": 1 }, "(2^12, 512, 512, 27)": { "SPLITK": 1 }, "(2^10, 1024, 4096, 27)": { "SPLITK": 1 }, "(2^10, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^12, 512, 128, 27)": { "SPLITK": 1 }, "(2^14, 256, 64, 27)": { "SPLITK": 16 }, "(2^16, 128, 32, 27)": { "SPLITK": 64 }, "(2^18, 64, 16, 27)": { "SPLITK": 128 }, "(2^11, 512, 2048, 27)": { "SPLITK": 1 }, "(2^11, 512, 512, 27)": { "SPLITK": 1 }, "(2^9, 1024, 4096, 27)": { "SPLITK": 1 }, "(2^9, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^11, 512, 128, 27)": { "SPLITK": 1 }, "(2^13, 256, 1024, 27)": { "SPLITK": 1 }, "(2^13, 256, 256, 27)": { "SPLITK": 4 }, "(2^13, 256, 64, 27)": { "SPLITK": 16 }, "(2^17, 64, 64, 27)": { "SPLITK": 64 }, "(2^15, 128, 512, 27)": { "SPLITK": 4 }, "(2^15, 128, 128, 27)": { "SPLITK": 16 }, "(2^15, 128, 32, 27)": { "SPLITK": 32 }, "(2^17, 64, 16, 27)": { "SPLITK": 128 }, "(2^8, 1024, 4096, 27)": { "SPLITK": 1 }, "(2^8, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^19, 64, 64, 27)": { "SPLITK": 128 }, "(2^17, 128, 512, 27)": { "SPLITK": 16 }, "(2^17, 128, 128, 27)": { "SPLITK": 32 }, "(2^15, 256, 1024, 27)": { "SPLITK": 2 }, "(2^15, 256, 256, 27)": { "SPLITK": 8 }, "(2^13, 512, 2048, 27)": { "SPLITK": 1 }, "(2^13, 512, 512, 27)": { "SPLITK": 1 }, "(2^13, 512, 128, 27)": { "SPLITK": 1 }, "(2^15, 256, 64, 27)": { "SPLITK": 8 }, "(2^17, 128, 32, 27)": { "SPLITK": 64 }, "(2^19, 64, 16, 27)": { "SPLITK": 128 }, "(2^11, 1024, 4096, 27)": { "SPLITK": 1 }, "(2^11, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^10, 512, 2048, 27)": { "SPLITK": 1 }, "(2^10, 512, 512, 27)": { "SPLITK": 1 }, "(2^10, 512, 128, 27)": { "SPLITK": 1 }, "(2^20, 64, 64, 27)": { "SPLITK": 128 }, "(2^20, 64, 16, 27)": { "SPLITK": 128 }, "(2^18, 128, 512, 27)": { "SPLITK": 32 }, "(2^18, 128, 128, 27)": { "SPLITK": 64 }, "(2^18, 128, 32, 27)": { "SPLITK": 64 }, "(2^16, 256, 1024, 27)": { "SPLITK": 4 }, "(2^16, 256, 256, 27)": { "SPLITK": 4 }, "(2^16, 256, 64, 27)": { "SPLITK": 8 }, "(2^12, 256, 1024, 27)": { "SPLITK": 1 }, "(2^12, 256, 256, 27)": { "SPLITK": 1 }, "(2^12, 256, 64, 27)": { "SPLITK": 8 }, "(2^21, 64, 64, 27)": { "SPLITK": 128 }, "(2^21, 64, 16, 27)": { "SPLITK": 128 }, "(2^19, 128, 512, 27)": { "SPLITK": 32 }, "(2^19, 128, 128, 27)": { "SPLITK": 128 }, "(2^17, 256, 1024, 27)": { "SPLITK": 8 }, "(2^17, 256, 256, 27)": { "SPLITK": 8 }, "(2^14, 512, 2048, 27)": { "SPLITK": 2 }, "(2^14, 512, 512, 27)": { "SPLITK": 1 }, "(2^14, 512, 128, 27)": { "SPLITK": 4 }, "(2^17, 256, 64, 27)": { "SPLITK": 32 }, "(2^19, 128, 32, 27)": { "SPLITK": 128 }, "(2^7, 1024, 4096, 27)": { "SPLITK": 1 }, "(2^7, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^14, 128, 512, 27)": { "SPLITK": 4 }, "(2^14, 128, 128, 27)": { "SPLITK": 16 }, "(2^14, 128, 32, 27)": { "SPLITK": 32 }, "(2^16, 64, 64, 27)": { "SPLITK": 32 }, "(2^16, 64, 16, 27)": { "SPLITK": 64 }, "(2^22, 64, 64, 27)": { "SPLITK": 8 }, "(2^22, 64, 16, 27)": { "SPLITK": 128 }, "(2^13, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^15, 512, 512, 27)": { "SPLITK": 1 } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_implicit_gemm.sparse_submanifold_conv_fwd_implicit_gemm_kernel": { "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_implicit_gemm_splitk.sparse_submanifold_conv_fwd_implicit_gemm_kernel": { "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 128, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_implicit_gemm_splitk.sparse_submanifold_conv_fwd_implicit_gemm_splitk_kernel": { "(8, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 16, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 4, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_implicit_gemm_splitk.sparse_submanifold_conv_fwd_implicit_gemm_splitk": { "(2^8, 1024, 1024, 27)": { "SPLITK": 8 }, "(2^10, 1024, 1024, 27)": { "SPLITK": 2 }, "(2^12, 1024, 1024, 27)": { "SPLITK": 2 }, "(2^14, 512, 512, 27)": { "SPLITK": 1 }, "(2^16, 256, 256, 27)": { "SPLITK": 1 }, "(2^18, 128, 128, 27)": { "SPLITK": 1 }, "(2^20, 64, 64, 27)": { "SPLITK": 1 }, "(2^7, 1024, 1024, 27)": { "SPLITK": 16 }, "(2^9, 1024, 1024, 27)": { "SPLITK": 8 }, "(2^11, 1024, 1024, 27)": { "SPLITK": 4 }, "(2^13, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^15, 512, 512, 27)": { "SPLITK": 1 }, "(2^17, 256, 256, 27)": { "SPLITK": 1 }, "(2^19, 128, 128, 27)": { "SPLITK": 1 } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_masked_implicit_gemm.sparse_submanifold_conv_fwd_masked_implicit_gemm_kernel": { "(18, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(24, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_fwd_masked_implicit_gemm_kernel": { "(18, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(21, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 32 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(19, 128, 128, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(18, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(20, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 64, "B2": 256, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(24, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_fwd_masked_implicit_gemm_splitk_kernel": { "(16, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 512, 2048, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 4096, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 64, 16, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 256, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 256, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 256, "BK": 64 }, "num_warps": 8, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(16, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(17, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 16, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(14, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(12, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(10, 256, 64, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 512, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 512, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 512, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 128, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 128, 27, 512, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(8, 512, 128, 27, 1024, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 64, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 64, 16, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(15, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 512, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 512, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 128, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 512, 128, 27, 512, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 16, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(13, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 256, "B2": 64, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 128, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 256, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 256, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 256, 64, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(6, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(6, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(6, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(6, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(6, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(6, 1024, 1024, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 64 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(7, 1024, 1024, 27, 4, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 64, "B2": 64, "BK": 64 }, "num_warps": 2, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(9, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(11, 1024, 1024, 27, 4, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { "kwargs": { "B1": 128, "B2": 128, "BK": 32 }, "num_warps": 4, "num_ctas": 1, "num_stages": 4, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } }, "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_fwd_masked_implicit_gemm_splitk": { "(2^18, 64, 64, 27)": { "SPLITK": 1 }, "(2^18, 64, 16, 27)": { "SPLITK": 1 }, "(2^16, 128, 128, 27)": { "SPLITK": 1 }, "(2^16, 128, 32, 27)": { "SPLITK": 1 }, "(2^14, 256, 256, 27)": { "SPLITK": 1 }, "(2^14, 256, 64, 27)": { "SPLITK": 1 }, "(2^12, 512, 512, 27)": { "SPLITK": 1 }, "(2^12, 512, 128, 27)": { "SPLITK": 16 }, "(2^10, 1024, 1024, 27)": { "SPLITK": 4 }, "(2^10, 1024, 4096, 27)": { "SPLITK": 4 }, "(2^12, 512, 2048, 27)": { "SPLITK": 1 }, "(2^14, 256, 1024, 27)": { "SPLITK": 1 }, "(2^16, 128, 512, 27)": { "SPLITK": 1 }, "(2^19, 64, 64, 27)": { "SPLITK": 1 }, "(2^19, 64, 16, 27)": { "SPLITK": 1 }, "(2^17, 128, 128, 27)": { "SPLITK": 1 }, "(2^17, 128, 32, 27)": { "SPLITK": 1 }, "(2^15, 256, 256, 27)": { "SPLITK": 1 }, "(2^15, 256, 64, 27)": { "SPLITK": 1 }, "(2^13, 512, 512, 27)": { "SPLITK": 2 }, "(2^13, 512, 128, 27)": { "SPLITK": 1 }, "(2^11, 1024, 1024, 27)": { "SPLITK": 4 }, "(2^11, 1024, 4096, 27)": { "SPLITK": 1 }, "(2^13, 512, 2048, 27)": { "SPLITK": 1 }, "(2^15, 256, 1024, 27)": { "SPLITK": 1 }, "(2^17, 128, 512, 27)": { "SPLITK": 1 }, "(2^11, 512, 512, 27)": { "SPLITK": 4 }, "(2^11, 512, 128, 27)": { "SPLITK": 16 }, "(2^9, 1024, 1024, 27)": { "SPLITK": 8 }, "(2^9, 1024, 4096, 27)": { "SPLITK": 8 }, "(2^11, 512, 2048, 27)": { "SPLITK": 1 }, "(2^13, 256, 256, 27)": { "SPLITK": 1 }, "(2^13, 256, 64, 27)": { "SPLITK": 1 }, "(2^13, 256, 1024, 27)": { "SPLITK": 1 }, "(2^17, 64, 64, 27)": { "SPLITK": 1 }, "(2^17, 64, 16, 27)": { "SPLITK": 1 }, "(2^15, 128, 128, 27)": { "SPLITK": 1 }, "(2^15, 128, 32, 27)": { "SPLITK": 1 }, "(2^15, 128, 512, 27)": { "SPLITK": 1 }, "(2^8, 1024, 1024, 27)": { "SPLITK": 16 }, "(2^8, 1024, 4096, 27)": { "SPLITK": 4 }, "(2^10, 512, 512, 27)": { "SPLITK": 1 }, "(2^10, 512, 128, 27)": { "SPLITK": 32 }, "(2^10, 512, 2048, 27)": { "SPLITK": 1 }, "(2^20, 64, 64, 27)": { "SPLITK": 1 }, "(2^20, 64, 16, 27)": { "SPLITK": 1 }, "(2^18, 128, 128, 27)": { "SPLITK": 1 }, "(2^18, 128, 32, 27)": { "SPLITK": 1 }, "(2^18, 128, 512, 27)": { "SPLITK": 1 }, "(2^16, 256, 256, 27)": { "SPLITK": 1 }, "(2^16, 256, 64, 27)": { "SPLITK": 1 }, "(2^16, 256, 1024, 27)": { "SPLITK": 1 }, "(2^12, 256, 256, 27)": { "SPLITK": 1 }, "(2^12, 256, 64, 27)": { "SPLITK": 1 }, "(2^12, 256, 1024, 27)": { "SPLITK": 1 }, "(2^21, 64, 64, 27)": { "SPLITK": 1 }, "(2^21, 64, 16, 27)": { "SPLITK": 1 }, "(2^19, 128, 128, 27)": { "SPLITK": 1 }, "(2^19, 128, 32, 27)": { "SPLITK": 1 }, "(2^17, 256, 256, 27)": { "SPLITK": 1 }, "(2^17, 256, 64, 27)": { "SPLITK": 1 }, "(2^14, 512, 512, 27)": { "SPLITK": 1 }, "(2^14, 512, 128, 27)": { "SPLITK": 4 }, "(2^14, 512, 2048, 27)": { "SPLITK": 1 }, "(2^17, 256, 1024, 27)": { "SPLITK": 1 }, "(2^19, 128, 512, 27)": { "SPLITK": 1 }, "(2^7, 1024, 1024, 27)": { "SPLITK": 4 }, "(2^7, 1024, 4096, 27)": { "SPLITK": 16 }, "(2^14, 128, 128, 27)": { "SPLITK": 1 }, "(2^14, 128, 32, 27)": { "SPLITK": 1 }, "(2^14, 128, 512, 27)": { "SPLITK": 1 }, "(2^16, 64, 64, 27)": { "SPLITK": 1 }, "(2^16, 64, 16, 27)": { "SPLITK": 1 }, "(2^22, 64, 64, 27)": { "SPLITK": 1 }, "(2^22, 64, 16, 27)": { "SPLITK": 1 }, "(2^12, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^10, 256, 256, 27)": { "SPLITK": 16 }, "(2^12, 1024, 4096, 27)": { "SPLITK": 1 }, "(2^15, 512, 512, 27)": { "SPLITK": 1 }, "(2^15, 512, 128, 27)": { "SPLITK": 1 }, "(2^13, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^13, 1024, 4096, 27)": { "SPLITK": 1 }, "(2^15, 512, 2048, 27)": { "SPLITK": 1 }, "(2^20, 128, 128, 27)": { "SPLITK": 1 }, "(2^20, 128, 32, 27)": { "SPLITK": 1 }, "(2^18, 256, 256, 27)": { "SPLITK": 1 }, "(2^18, 256, 64, 27)": { "SPLITK": 1 }, "(2^18, 256, 1024, 27)": { "SPLITK": 1 }, "(2^20, 128, 512, 27)": { "SPLITK": 1 }, "(2^16, 512, 512, 27)": { "SPLITK": 1 }, "(2^16, 512, 128, 27)": { "SPLITK": 1 }, "(2^14, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^14, 1024, 4096, 27)": { "SPLITK": 1 }, "(2^16, 512, 2048, 27)": { "SPLITK": 1 }, "(2^23, 64, 64, 27)": { "SPLITK": 1 }, "(2^23, 64, 16, 27)": { "SPLITK": 1 }, "(2^21, 128, 128, 27)": { "SPLITK": 1 }, "(2^21, 128, 32, 27)": { "SPLITK": 1 }, "(2^19, 256, 256, 27)": { "SPLITK": 1 }, "(2^19, 256, 64, 27)": { "SPLITK": 1 }, "(2^17, 512, 512, 27)": { "SPLITK": 1 }, "(2^17, 512, 128, 27)": { "SPLITK": 1 }, "(2^15, 1024, 1024, 27)": { "SPLITK": 1 }, "(2^15, 1024, 4096, 27)": { "SPLITK": 1 }, "(2^17, 512, 2048, 27)": { "SPLITK": 1 }, "(2^19, 256, 1024, 27)": { "SPLITK": 1 }, "(2^21, 128, 512, 27)": { "SPLITK": 1 }, "(2^11, 256, 256, 27)": { "SPLITK": 1 }, "(2^11, 256, 64, 27)": { "SPLITK": 32 }, "(2^14, 64, 64, 27)": { "SPLITK": 2 }, "(2^14, 64, 16, 27)": { "SPLITK": 4 }, "(2^12, 128, 128, 27)": { "SPLITK": 4 }, "(2^12, 128, 32, 27)": { "SPLITK": 32 }, "(2^10, 256, 64, 27)": { "SPLITK": 128 }, "(2^8, 512, 512, 27)": { "SPLITK": 64 }, "(2^8, 512, 128, 27)": { "SPLITK": 256 }, "(2^15, 64, 64, 27)": { "SPLITK": 1 }, "(2^15, 64, 16, 27)": { "SPLITK": 4 }, "(2^13, 128, 128, 27)": { "SPLITK": 2 }, "(2^13, 128, 32, 27)": { "SPLITK": 32 }, "(2^9, 512, 512, 27)": { "SPLITK": 8 }, "(2^9, 512, 128, 27)": { "SPLITK": 256 }, "(2^13, 64, 64, 27)": { "SPLITK": 1 }, "(2^13, 64, 16, 27)": { "SPLITK": 1 }, "(2^9, 256, 256, 27)": { "SPLITK": 8 }, "(2^9, 256, 64, 27)": { "SPLITK": 32 }, "(2^6, 1024, 1024, 27)": { "SPLITK": 8 }, "(2^18, 512, 512, 27)": { "SPLITK": 1 }, "(2^18, 512, 2048, 27)": { "SPLITK": 1 }, "(2^20, 256, 256, 27)": { "SPLITK": 1 }, "(2^20, 256, 1024, 27)": { "SPLITK": 1 }, "(2^22, 128, 128, 27)": { "SPLITK": 1 }, "(2^22, 128, 512, 27)": { "SPLITK": 1 }, "(2^24, 64, 64, 27)": { "SPLITK": 1 } }, "flex_gemm.triton.gemm.gemm_nn.gemm_nn_kernel": {}, "flex_gemm.triton.gemm.gemm_nn_splitk.gemm_nn_kernel": {}, "flex_gemm.triton.gemm.gemm_nn_splitk.gemm_nn_splitk_kernel": {}, "flex_gemm.triton.gemm.gemm_nn_splitk.gemm_nn_splitk": {}, "flex_gemm.triton.gemm.gemm_nn_splitk_atomic.gemm_nn_splitk_atomic_kernel": {}, "flex_gemm.triton.gemm.gemm_nn_splitk_lock.gemm_nn_splitk_lock_kernel": {}, "flex_gemm.kernels.triton.grid_sample.indice_weighed_sum_bwd.indice_weighed_sum_bwd_input_kernel": {}, "flex_gemm.kernels.triton.grid_sample.indice_weighed_sum_fwd.indice_weighed_sum_fwd_kernel": { "(23, 4194304, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 4194304, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 32, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 1941851, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 7877533, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 2226123, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 12748156, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 1911611, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(22, 5017088, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(24, 4194304, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(24, 1112821, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(24, 33122502, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 1284070, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 10949861, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 1571524, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 12857151, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 16 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 1641261, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 15495770, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 1808517, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 9814710, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 32, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 1780790, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 14310670, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 32, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(24, 2113485, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(24, 21145957, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 32 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 1782768, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 14828658, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(24, 1577958, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(24, 17012742, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 32, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 2345014, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 12887107, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 2168973, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 12754203, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 1873811, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 12506390, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 2223717, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 8, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null }, "(23, 14387524, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { "kwargs": { "BM": 16, "BK": 8 }, "num_warps": 2, "num_ctas": 1, "num_stages": 2, "num_buffers_warp_spec": 0, "num_consumer_groups": 0, "reg_dec_producer": 0, "reg_inc_consumer": 0, "maxnreg": null, "pre_hook": null } } } }