Last active
November 30, 2023 01:45
-
-
Save davidberard98/5927855a91e4818a32908493dcdb7e38 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump Before Inliner (inline) ('builtin.module' operation) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc64 = loc(unknown) | |
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
module { | |
// Public kernel entry point.
//   %arg0 : pointer to f64 data, read twice through masked tt.load ops.
//   %arg1 : pointer to i64 data, written once through tt.store at the end.
//   %arg2, %arg3 : i32 scalars; no op in this body references them.
// Per 1x32 lane the body loads two neighbouring f64 values, sums them, divides
// by the number of active contributions, and passes the result together with
// the lane index to the max-with-index reduction; the returned index is stored.
// NOTE(review): this resembles a 2-element average followed by an argmax over
// 32 lanes - confirm against the generating Python kernel.
// Apart from %c1_i32_0, the scalar integer constants (%c..._i32 / %c..._i64)
// defined below are not referenced by any later op in this function.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%c1_i32 = arith.constant 1 : i32 loc(#loc1) | |
%c32_i32 = arith.constant 32 : i32 loc(#loc2) | |
// 1x1 index %5 = program_id(x) * 1 + range(0, 1), and its "< 1" mask %6.
// %6 is not consumed by any later op in this function.
%0 = tt.get_program_id x : i32 loc(#loc3) | |
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4) | |
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4) | |
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5) | |
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6) | |
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7) | |
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7) | |
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8) | |
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8) | |
// 1x32 lane index %8 = range(0, 32) and lane mask %9 = (%8 < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9) | |
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10) | |
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11) | |
// Split the lane index: %10 = %8 rem 2, %11 = %8 div 2 (signed ops).
%c2_i32 = arith.constant 2 : i32 loc(#loc12) | |
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12) | |
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12) | |
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13) | |
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13) | |
// %12 compares the constant 0 against the constant 1 (signed less-than) on
// 1x1 i64 tensors; both operands are compile-time constants.
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14) | |
%c1_i64 = arith.constant 1 : i64 loc(#loc15) | |
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15) | |
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16) | |
// First predicate: %14 = (%10 * 3) / 2, %17 = (%10 * 3) / 2 + 2,
// %20 = broadcast(%12) AND (%14 < %17).
%c3_i32 = arith.constant 3 : i32 loc(#loc17) | |
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17) | |
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17) | |
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18) | |
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18) | |
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18) | |
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19) | |
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19) | |
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19) | |
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20) | |
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20) | |
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20) | |
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21) | |
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21) | |
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21) | |
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22) | |
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23) | |
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23) | |
// First address: %arg0 + (%11 * 3 + (%10 * 3) / 2).
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24) | |
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24) | |
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24) | |
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25) | |
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25) | |
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26) | |
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26) | |
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27) | |
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28) | |
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28) | |
// First masked load: mask = %9 AND %20, fallback value 0.0 (an f32 zero
// extended to f64). %30 then keeps the loaded value only where %20 holds.
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29) | |
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30) | |
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30) | |
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30) | |
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30) | |
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31) | |
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31) | |
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
// Second predicate: %33 = (%10 * 3) / 2 + 1,
// %36 = broadcast(%12) AND (%33 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33) | |
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33) | |
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33) | |
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34) | |
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34) | |
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34) | |
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35) | |
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35) | |
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35) | |
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36) | |
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37) | |
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37) | |
// Second address: %arg0 + (%11 * 3 + 1 + (%10 * 3) / 2), i.e. one element
// past the first address.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38) | |
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38) | |
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38) | |
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39) | |
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39) | |
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39) | |
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40) | |
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40) | |
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40) | |
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41) | |
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41) | |
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41) | |
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42) | |
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43) | |
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43) | |
// Second masked load: mask = %9 AND %36, fallback 0.0; %47 keeps the loaded
// value only where %36 holds.
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44) | |
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45) | |
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45) | |
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45) | |
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45) | |
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46) | |
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46) | |
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47) | |
// Sum of the two contributions.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48) | |
// Count of active contributions: 1.0 where each predicate holds, else 0.0;
// %52 = sum / count. %cst_44 .. %cst_47 are not referenced by any later op.
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49) | |
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49) | |
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50) | |
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50) | |
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51) | |
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52) | |
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53) | |
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54) | |
// Lanes where %9 is false are replaced by negative infinity (0xFF800000 is
// the f32 bit pattern of -inf, extended to f64) before the max reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55) | |
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55) | |
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55) | |
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55) | |
// Reduce (value, lane index) pairs; only the index result %55#1 is used.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56) | |
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57) | |
// Store the index, sign-extended to i64, at %arg1 + 0.
%c0_i32 = arith.constant 0 : i32 loc(#loc58) | |
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58) | |
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
// Reduces a 1x32 f64 value tensor (%arg0) and a 1x32 i32 index tensor (%arg1)
// along axis 1 and returns the resulting (value, index) pair as two
// 1-element tensors. The combine region forwards each pair of
// (value, index) operands to @maximum_with_index__fp64_i32_fp64_i32__.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} { | |
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ | |
// Region arguments: (%arg2, %arg3) is one (value, index) pair and
// (%arg4, %arg5) is the other.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)): | |
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63) | |
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65) | |
} loc(#loc62) | |
// Picks one of two (value, index) pairs: (%arg0, %arg1) or (%arg2, %arg3).
// The first pair is returned when its value is greater, or when the values
// tie and its index is smaller; otherwise the second pair is returned.
// On the branch taken when @is_floating__fp64__ returns true, a NaN value is
// treated as greater than a non-NaN value and two NaNs count as a tie.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} { | |
// Ordered comparisons: both are false if either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67) | |
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68) | |
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69) | |
// %3#0 = "first value wins outright", %3#1 = "values tie".
%3:2 = scf.if %2 -> (i1, i1) { | |
// "x une x" is true only when x is NaN.
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71) | |
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72) | |
%true = arith.constant true loc(#loc73) | |
// %11 = NOT %10 (xor with true).
%11 = arith.xori %10, %true : i1 loc(#loc73) | |
// First is NaN and second is not: first wins.
%12 = arith.andi %9, %11 : i1 loc(#loc74) | |
%13 = arith.ori %0, %12 : i1 loc(#loc75) | |
// Both NaN: treated as a tie.
%14 = arith.andi %9, %10 : i1 loc(#loc76) | |
%15 = arith.ori %1, %14 : i1 loc(#loc77) | |
scf.yield %13, %15 : i1, i1 loc(#loc77) | |
} else { | |
scf.yield %0, %1 : i1, i1 loc(#loc64) | |
} loc(#loc70) | |
// Tie-break: on a tie the pair with the smaller index is preferred.
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78) | |
%5 = arith.andi %3#1, %4 : i1 loc(#loc79) | |
%6 = arith.ori %3#0, %5 : i1 loc(#loc80) | |
// Select value and index with the same predicate so they stay paired.
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81) | |
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82) | |
tt.return %7, %8 : f64, i32 loc(#loc83) | |
} loc(#loc66) | |
// Returns the constant `true` for an f64 argument. The result of the
// @promote_to_tensor__fp64__ call (%0) is not used by the return value.
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} { | |
%0 = tt.call @promote_to_tensor__fp64__(%arg0) : (f64) -> tensor<1xf64> loc(#loc85) | |
%true = arith.constant true loc(#loc86) | |
tt.return %true : i1 loc(#loc86) | |
} loc(#loc84) | |
// Turns an f64 scalar into a tensor<1xf64>: the scalar is splatted and then
// added to the tensor<1xi1> returned by the zeros helper after that tensor is
// converted to f64 with an unsigned int-to-float cast.
tt.func private @promote_to_tensor__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)) -> tensor<1xf64> attributes {noinline = false} { | |
%0 = tt.call @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc88) | |
%1 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc89) | |
%2 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf64> loc(#loc89) | |
%3 = arith.addf %1, %2 : tensor<1xf64> loc(#loc89) | |
tt.return %3 : tensor<1xf64> loc(#loc90) | |
} loc(#loc87) | |
// Returns a 1-element i1 tensor filled with `false`. The scalar %false
// constant is defined but not referenced by the return.
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { | |
%false = arith.constant false loc(#loc92) | |
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc92) | |
tt.return %cst : tensor<1xi1> loc(#loc93) | |
} loc(#loc91) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38) | |
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11) | |
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19) | |
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7) | |
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11) | |
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29) | |
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11) | |
#loc88 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:30) | |
#loc89 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15) | |
#loc90 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:11) | |
#loc91 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
#loc92 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:31) | |
#loc93 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:11) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc64 = loc(unknown) | |
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
module { | |
// Public kernel entry point as it appears in this dump.
//   %arg0 : pointer to f64 data, read twice through masked tt.load ops.
//   %arg1 : pointer to i64 data, written once through tt.store at the end.
//   %arg2, %arg3 : i32 scalars; no op in this body references them.
// Per 1x32 lane the body loads two neighbouring f64 values, sums them, divides
// by the number of active contributions, and passes the result together with
// the lane index to the max-with-index reduction; the returned index is stored.
// NOTE(review): this resembles a 2-element average followed by an argmax over
// 32 lanes - confirm against the generating Python kernel.
// Apart from %c1_i32_0, the scalar integer constants (%c..._i32 / %c..._i64)
// defined below are not referenced by any later op in this function.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%c1_i32 = arith.constant 1 : i32 loc(#loc1) | |
%c32_i32 = arith.constant 32 : i32 loc(#loc2) | |
// 1x1 index %5 = program_id(x) * 1 + range(0, 1), and its "< 1" mask %6.
// %6 is not consumed by any later op in this function.
%0 = tt.get_program_id x : i32 loc(#loc3) | |
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4) | |
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4) | |
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5) | |
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6) | |
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7) | |
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7) | |
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8) | |
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8) | |
// 1x32 lane index %8 = range(0, 32) and lane mask %9 = (%8 < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9) | |
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10) | |
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11) | |
// Split the lane index: %10 = %8 rem 2, %11 = %8 div 2 (signed ops).
%c2_i32 = arith.constant 2 : i32 loc(#loc12) | |
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12) | |
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12) | |
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13) | |
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13) | |
// %12 compares the constant 0 against the constant 1 (signed less-than) on
// 1x1 i64 tensors; both operands are compile-time constants.
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14) | |
%c1_i64 = arith.constant 1 : i64 loc(#loc15) | |
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15) | |
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16) | |
// First predicate: %14 = (%10 * 3) / 2, %17 = (%10 * 3) / 2 + 2,
// %20 = broadcast(%12) AND (%14 < %17).
%c3_i32 = arith.constant 3 : i32 loc(#loc17) | |
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17) | |
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17) | |
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18) | |
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18) | |
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18) | |
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19) | |
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19) | |
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19) | |
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20) | |
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20) | |
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20) | |
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21) | |
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21) | |
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21) | |
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22) | |
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23) | |
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23) | |
// First address: %arg0 + (%11 * 3 + (%10 * 3) / 2).
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24) | |
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24) | |
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24) | |
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25) | |
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25) | |
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26) | |
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26) | |
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27) | |
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28) | |
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28) | |
// First masked load: mask = %9 AND %20, fallback value 0.0 (an f32 zero
// extended to f64). %30 then keeps the loaded value only where %20 holds.
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29) | |
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30) | |
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30) | |
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30) | |
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30) | |
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31) | |
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31) | |
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
// Second predicate: %33 = (%10 * 3) / 2 + 1,
// %36 = broadcast(%12) AND (%33 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33) | |
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33) | |
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33) | |
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34) | |
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34) | |
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34) | |
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35) | |
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35) | |
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35) | |
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36) | |
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37) | |
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37) | |
// Second address: %arg0 + (%11 * 3 + 1 + (%10 * 3) / 2), i.e. one element
// past the first address.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38) | |
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38) | |
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38) | |
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39) | |
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39) | |
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39) | |
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40) | |
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40) | |
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40) | |
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41) | |
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41) | |
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41) | |
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42) | |
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43) | |
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43) | |
// Second masked load: mask = %9 AND %36, fallback 0.0; %47 keeps the loaded
// value only where %36 holds.
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44) | |
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45) | |
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45) | |
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45) | |
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45) | |
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46) | |
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46) | |
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47) | |
// Sum of the two contributions.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48) | |
// Count of active contributions: 1.0 where each predicate holds, else 0.0;
// %52 = sum / count. %cst_44 .. %cst_47 are not referenced by any later op.
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49) | |
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49) | |
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50) | |
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50) | |
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51) | |
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52) | |
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53) | |
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54) | |
// Lanes where %9 is false are replaced by negative infinity (0xFF800000 is
// the f32 bit pattern of -inf, extended to f64) before the max reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55) | |
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55) | |
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55) | |
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55) | |
// Reduce (value, lane index) pairs; only the index result %55#1 is used.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56) | |
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57) | |
// Store the index, sign-extended to i64, at %arg1 + 0.
%c0_i32 = arith.constant 0 : i32 loc(#loc58) | |
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58) | |
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
// Reduces a 1x32 tensor of f64 values (%arg0) together with a same-shaped
// tensor of i32 values (%arg1) along axis 1, producing one f64 and one i32
// result (tensor<1xf64>, tensor<1xi32>).
// The combine region receives two (f64, i32) pairs, forwards them to
// @maximum_with_index__fp64_i32_fp64_i32__, and returns that call's two
// results unchanged.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} { | |
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ | |
// Block arguments are (value, i32) of the first operand pair followed by
// (value, i32) of the second operand pair.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)): | |
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63) | |
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65) | |
} loc(#loc62) | |
// Combine step over two (f64, i32) pairs: (%arg0, %arg1) and (%arg2, %arg3).
// Returns the first pair when %6 is true and the second pair otherwise.
// %6 is true when the first value is greater, or is NaN while the second is
// not, or when the two values tie (equal, or both NaN) and %arg1 < %arg3.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} { | |
// Ordered comparisons: both are false whenever either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67) | |
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68) | |
// The NaN-aware refinement below runs only if @is_floating__fp64__ returns true.
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69) | |
%3:2 = scf.if %2 -> (i1, i1) { | |
// `une x, x` (unordered or not-equal) holds only for NaN, so %9 and %10
// are the NaN flags of %arg0 and %arg2 respectively.
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71) | |
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72) | |
%true = arith.constant true loc(#loc73) | |
// %11 = NOT %10 (xor with true).
%11 = arith.xori %10, %true : i1 loc(#loc73) | |
// %13: first value wins outright (greater, or NaN while the second is not).
%12 = arith.andi %9, %11 : i1 loc(#loc74) | |
%13 = arith.ori %0, %12 : i1 loc(#loc75) | |
// %15: the values tie (compare equal, or both are NaN).
%14 = arith.andi %9, %10 : i1 loc(#loc76) | |
%15 = arith.ori %1, %14 : i1 loc(#loc77) | |
scf.yield %13, %15 : i1, i1 loc(#loc77) | |
} else { | |
// Without the NaN handling the plain comparison results are used as-is.
scf.yield %0, %1 : i1, i1 loc(#loc64) | |
} loc(#loc70) | |
// Tie-break: on a tie the pair with the smaller (signed) i32 is preferred.
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78) | |
%5 = arith.andi %3#1, %4 : i1 loc(#loc79) | |
%6 = arith.ori %3#0, %5 : i1 loc(#loc80) | |
// Select the f64 and the i32 with the same predicate so the pair stays matched.
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81) | |
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82) | |
tt.return %7, %8 : f64, i32 loc(#loc83) | |
} loc(#loc66) | |
// Returns the constant `true` for its f64 argument.
// The call to @promote_to_tensor__fp64__ is still present in this dump, but
// its result %0 is not used by any operation in this function.
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} { | |
%0 = tt.call @promote_to_tensor__fp64__(%arg0) : (f64) -> tensor<1xf64> loc(#loc85) | |
%true = arith.constant true loc(#loc86) | |
tt.return %true : i1 loc(#loc86) | |
} loc(#loc84) | |
// Converts a scalar f64 into a tensor<1xf64>.
// The scalar is splatted to a one-element tensor (%1) and added to the
// tensor<1xi1> returned by the zeros helper after converting it to f64 with
// uitofp (%2); the sum %3 is returned.
tt.func private @promote_to_tensor__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)) -> tensor<1xf64> attributes {noinline = false} { | |
%0 = tt.call @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc88) | |
%1 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc89) | |
%2 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf64> loc(#loc89) | |
%3 = arith.addf %1, %2 : tensor<1xf64> loc(#loc89) | |
tt.return %3 : tensor<1xf64> loc(#loc90) | |
} loc(#loc87) | |
// Takes no arguments and returns a tensor<1xi1> whose single element is false.
// %false is a scalar constant that no operation in this function uses; only
// the dense tensor constant %cst is returned.
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { | |
%false = arith.constant false loc(#loc92) | |
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc92) | |
tt.return %cst : tensor<1xi1> loc(#loc93) | |
} loc(#loc91) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38) | |
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11) | |
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19) | |
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7) | |
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11) | |
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29) | |
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11) | |
#loc88 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:30) | |
#loc89 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15) | |
#loc90 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:11) | |
#loc91 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
#loc92 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:31) | |
#loc93 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:11) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @promote_to_tensor__fp64__) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc64 = loc(unknown) | |
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
module { | |
// Public kernel entry point.
//   %arg0: f64 pointer that is read from (two masked loads).
//   %arg1: i64 pointer that is written to (one store at element offset 0).
//   %arg2, %arg3: i32 arguments that no operation in this function references.
// For each of 32 lanes j the kernel averages up to two masked f64 loads from
// %arg0, reduces the 32 averages with @max_with_index… (paired with the lane
// indices), and stores the resulting i32, sign-extended to i64, through %arg1.
// Many scalar constants (%c…) are emitted next to their dense tensor twins and
// are not referenced by later operations in this function.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%c1_i32 = arith.constant 1 : i32 loc(#loc1) | |
%c32_i32 = arith.constant 32 : i32 loc(#loc2) | |
// %5 = program id (x) * 1 + range [0, 1), as a 1x1 tensor; %6 is its "< 1"
// predicate. %6 is not referenced again within this function.
%0 = tt.get_program_id x : i32 loc(#loc3) | |
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4) | |
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4) | |
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5) | |
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6) | |
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7) | |
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7) | |
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8) | |
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8) | |
// %8 = lane index j in [0, 32) as a 1x32 tensor; %9 = (j < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9) | |
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10) | |
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11) | |
// %10 = j % 2 and %11 = j / 2 (signed remainder / division).
%c2_i32 = arith.constant 2 : i32 loc(#loc12) | |
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12) | |
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12) | |
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13) | |
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13) | |
// %12 compares the constants 0 < 1 as 1x1 i64 tensors.
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14) | |
%c1_i64 = arith.constant 1 : i64 loc(#loc15) | |
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15) | |
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16) | |
// %14 = (%10 * 3) / 2.
%c3_i32 = arith.constant 3 : i32 loc(#loc17) | |
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17) | |
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17) | |
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18) | |
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18) | |
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18) | |
// %17 = (%10 * 3) / 2 + 2; reused as the upper bound for both load masks.
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19) | |
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19) | |
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19) | |
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20) | |
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20) | |
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20) | |
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21) | |
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21) | |
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21) | |
// First validity mask: %20 = broadcast(%12) AND (%14 < %17).
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22) | |
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23) | |
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23) | |
// First load: element offset %24 = %11 * 3 + (%10 * 3) / 2 from %arg0,
// masked by %9 AND %20, with 0.0 (f32 zero extended to f64) as the fill value.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24) | |
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24) | |
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24) | |
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25) | |
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25) | |
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26) | |
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26) | |
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27) | |
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28) | |
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28) | |
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29) | |
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30) | |
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30) | |
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30) | |
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30) | |
// %30 = loaded value where %20 holds, otherwise 0.0.
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31) | |
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31) | |
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
// Second validity mask: %33 = (%10 * 3) / 2 + 1; %36 = broadcast(%12) AND (%33 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33) | |
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33) | |
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33) | |
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34) | |
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34) | |
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34) | |
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35) | |
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35) | |
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35) | |
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36) | |
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37) | |
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37) | |
// Second load: element offset %41 = (%11 * 3 + 1) + (%10 * 3) / 2 from %arg0,
// masked by %9 AND %36, again with 0.0 as the fill value.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38) | |
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38) | |
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38) | |
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39) | |
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39) | |
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39) | |
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40) | |
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40) | |
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40) | |
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41) | |
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41) | |
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41) | |
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42) | |
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43) | |
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43) | |
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44) | |
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45) | |
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45) | |
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45) | |
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45) | |
// %47 = second loaded value where %36 holds, otherwise 0.0; %48 = %47 + %30.
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46) | |
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46) | |
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47) | |
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48) | |
// %51 = number of set masks among %20 and %36, as f64 (1.0 per set mask);
// %52 = %48 / %51, i.e. the sum divided by that count.
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49) | |
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49) | |
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50) | |
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50) | |
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51) | |
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52) | |
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53) | |
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54) | |
// 0xFF800000 is the f32 bit pattern of -infinity; lanes where %9 is false
// are replaced with -inf before the reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55) | |
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55) | |
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55) | |
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55) | |
// Reduce the values %54 paired with the lane indices %8; only the i32 result
// (%55#1) is used afterwards.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56) | |
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57) | |
// Store the i32 result, sign-extended to i64, at %arg1 + 0. The store has no
// mask operand.
%c0_i32 = arith.constant 0 : i32 loc(#loc58) | |
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58) | |
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
// Reduces a 1x32 tensor of f64 values (%arg0) together with a same-shaped
// tensor of i32 values (%arg1) along axis 1, producing one f64 and one i32
// result (tensor<1xf64>, tensor<1xi32>).
// The combine region receives two (f64, i32) pairs, forwards them to
// @maximum_with_index__fp64_i32_fp64_i32__, and returns that call's two
// results unchanged.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} { | |
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ | |
// Block arguments are (value, i32) of the first operand pair followed by
// (value, i32) of the second operand pair.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)): | |
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63) | |
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65) | |
} loc(#loc62) | |
// Combine step over two (f64, i32) pairs: (%arg0, %arg1) and (%arg2, %arg3).
// Returns the first pair when %6 is true and the second pair otherwise.
// %6 is true when the first value is greater, or is NaN while the second is
// not, or when the two values tie (equal, or both NaN) and %arg1 < %arg3.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} { | |
// Ordered comparisons: both are false whenever either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67) | |
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68) | |
// The NaN-aware refinement below runs only if @is_floating__fp64__ returns true.
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69) | |
%3:2 = scf.if %2 -> (i1, i1) { | |
// `une x, x` (unordered or not-equal) holds only for NaN, so %9 and %10
// are the NaN flags of %arg0 and %arg2 respectively.
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71) | |
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72) | |
%true = arith.constant true loc(#loc73) | |
// %11 = NOT %10 (xor with true).
%11 = arith.xori %10, %true : i1 loc(#loc73) | |
// %13: first value wins outright (greater, or NaN while the second is not).
%12 = arith.andi %9, %11 : i1 loc(#loc74) | |
%13 = arith.ori %0, %12 : i1 loc(#loc75) | |
// %15: the values tie (compare equal, or both are NaN).
%14 = arith.andi %9, %10 : i1 loc(#loc76) | |
%15 = arith.ori %1, %14 : i1 loc(#loc77) | |
scf.yield %13, %15 : i1, i1 loc(#loc77) | |
} else { | |
// Without the NaN handling the plain comparison results are used as-is.
scf.yield %0, %1 : i1, i1 loc(#loc64) | |
} loc(#loc70) | |
// Tie-break: on a tie the pair with the smaller (signed) i32 is preferred.
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78) | |
%5 = arith.andi %3#1, %4 : i1 loc(#loc79) | |
%6 = arith.ori %3#0, %5 : i1 loc(#loc80) | |
// Select the f64 and the i32 with the same predicate so the pair stays matched.
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81) | |
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82) | |
tt.return %7, %8 : f64, i32 loc(#loc83) | |
} loc(#loc66) | |
// Returns the constant `true` for its f64 argument.
// The call to @promote_to_tensor__fp64__ is still present in this dump, but
// its result %0 is not used by any operation in this function.
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} { | |
%0 = tt.call @promote_to_tensor__fp64__(%arg0) : (f64) -> tensor<1xf64> loc(#loc85) | |
%true = arith.constant true loc(#loc86) | |
tt.return %true : i1 loc(#loc86) | |
} loc(#loc84) | |
// Converts a scalar f64 into a tensor<1xf64>.
// The scalar is splatted to a one-element tensor (%1) and added to the
// tensor<1xi1> returned by the zeros helper after converting it to f64 with
// uitofp (%2); the sum %3 is returned.
tt.func private @promote_to_tensor__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)) -> tensor<1xf64> attributes {noinline = false} { | |
%0 = tt.call @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc88) | |
%1 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc89) | |
%2 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf64> loc(#loc89) | |
%3 = arith.addf %1, %2 : tensor<1xf64> loc(#loc89) | |
tt.return %3 : tensor<1xf64> loc(#loc90) | |
} loc(#loc87) | |
// Takes no arguments and returns a tensor<1xi1> whose single element is false,
// materialized as the dense constant %cst.
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { | |
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc92) | |
tt.return %cst : tensor<1xi1> loc(#loc93) | |
} loc(#loc91) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38) | |
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11) | |
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19) | |
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7) | |
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11) | |
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29) | |
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11) | |
#loc88 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:30) | |
#loc89 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15) | |
#loc90 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:11) | |
#loc91 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
#loc92 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:31) | |
#loc93 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:11) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @promote_to_tensor__fp64__) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc64 = loc(unknown) | |
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
module { | |
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%c1_i32 = arith.constant 1 : i32 loc(#loc1) | |
%c32_i32 = arith.constant 32 : i32 loc(#loc2) | |
%0 = tt.get_program_id x : i32 loc(#loc3) | |
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4) | |
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4) | |
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5) | |
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6) | |
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7) | |
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7) | |
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8) | |
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8) | |
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9) | |
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10) | |
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11) | |
%c2_i32 = arith.constant 2 : i32 loc(#loc12) | |
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12) | |
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12) | |
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13) | |
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13) | |
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14) | |
%c1_i64 = arith.constant 1 : i64 loc(#loc15) | |
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15) | |
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16) | |
%c3_i32 = arith.constant 3 : i32 loc(#loc17) | |
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17) | |
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17) | |
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18) | |
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18) | |
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18) | |
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19) | |
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19) | |
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19) | |
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20) | |
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20) | |
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20) | |
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21) | |
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21) | |
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21) | |
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22) | |
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23) | |
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23) | |
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24) | |
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24) | |
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24) | |
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25) | |
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25) | |
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26) | |
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26) | |
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27) | |
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28) | |
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28) | |
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29) | |
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30) | |
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30) | |
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30) | |
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30) | |
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31) | |
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31) | |
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33) | |
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33) | |
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33) | |
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34) | |
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34) | |
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34) | |
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35) | |
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35) | |
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35) | |
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36) | |
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37) | |
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37) | |
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38) | |
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38) | |
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38) | |
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39) | |
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39) | |
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39) | |
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40) | |
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40) | |
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40) | |
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41) | |
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41) | |
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41) | |
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42) | |
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43) | |
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43) | |
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44) | |
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45) | |
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45) | |
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45) | |
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45) | |
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46) | |
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46) | |
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47) | |
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48) | |
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49) | |
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49) | |
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50) | |
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50) | |
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51) | |
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52) | |
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53) | |
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54) | |
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55) | |
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55) | |
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55) | |
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55) | |
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56) | |
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc58) | |
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58) | |
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} { | |
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ | |
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)): | |
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63) | |
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65) | |
} loc(#loc62) | |
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} { | |
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67) | |
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68) | |
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69) | |
%3:2 = scf.if %2 -> (i1, i1) { | |
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71) | |
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72) | |
%true = arith.constant true loc(#loc73) | |
%11 = arith.xori %10, %true : i1 loc(#loc73) | |
%12 = arith.andi %9, %11 : i1 loc(#loc74) | |
%13 = arith.ori %0, %12 : i1 loc(#loc75) | |
%14 = arith.andi %9, %10 : i1 loc(#loc76) | |
%15 = arith.ori %1, %14 : i1 loc(#loc77) | |
scf.yield %13, %15 : i1, i1 loc(#loc77) | |
} else { | |
scf.yield %0, %1 : i1, i1 loc(#loc64) | |
} loc(#loc70) | |
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78) | |
%5 = arith.andi %3#1, %4 : i1 loc(#loc79) | |
%6 = arith.ori %3#0, %5 : i1 loc(#loc80) | |
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81) | |
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82) | |
tt.return %7, %8 : f64, i32 loc(#loc83) | |
} loc(#loc66) | |
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} { | |
%0 = tt.call @promote_to_tensor__fp64__(%arg0) : (f64) -> tensor<1xf64> loc(#loc85) | |
%true = arith.constant true loc(#loc86) | |
tt.return %true : i1 loc(#loc86) | |
} loc(#loc84) | |
tt.func private @promote_to_tensor__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)) -> tensor<1xf64> attributes {noinline = false} { | |
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc93) | |
%0 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc90) | |
%1 = arith.uitofp %cst : tensor<1xi1> to tensor<1xf64> loc(#loc90) | |
%2 = arith.addf %0, %1 : tensor<1xf64> loc(#loc90) | |
tt.return %2 : tensor<1xf64> loc(#loc91) | |
} loc(#loc87) | |
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc92) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38) | |
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11) | |
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19) | |
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7) | |
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11) | |
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29) | |
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11) | |
#loc88 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:31) | |
#loc89 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:30) | |
#loc90 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15) | |
#loc91 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:11) | |
#loc92 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
#loc93 = loc(callsite(#loc88 at #loc89)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @is_floating__fp64__) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc64 = loc(unknown) | |
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
module { | |
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%c1_i32 = arith.constant 1 : i32 loc(#loc1) | |
%c32_i32 = arith.constant 32 : i32 loc(#loc2) | |
%0 = tt.get_program_id x : i32 loc(#loc3) | |
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4) | |
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4) | |
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5) | |
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6) | |
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7) | |
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7) | |
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8) | |
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8) | |
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9) | |
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10) | |
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11) | |
%c2_i32 = arith.constant 2 : i32 loc(#loc12) | |
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12) | |
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12) | |
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13) | |
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13) | |
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14) | |
%c1_i64 = arith.constant 1 : i64 loc(#loc15) | |
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15) | |
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16) | |
%c3_i32 = arith.constant 3 : i32 loc(#loc17) | |
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17) | |
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17) | |
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18) | |
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18) | |
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18) | |
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19) | |
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19) | |
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19) | |
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20) | |
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20) | |
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20) | |
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21) | |
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21) | |
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21) | |
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22) | |
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23) | |
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23) | |
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24) | |
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24) | |
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24) | |
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25) | |
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25) | |
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26) | |
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26) | |
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27) | |
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28) | |
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28) | |
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29) | |
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30) | |
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30) | |
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30) | |
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30) | |
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31) | |
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31) | |
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33) | |
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33) | |
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33) | |
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34) | |
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34) | |
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34) | |
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35) | |
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35) | |
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35) | |
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36) | |
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37) | |
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37) | |
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38) | |
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38) | |
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38) | |
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39) | |
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39) | |
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39) | |
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40) | |
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40) | |
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40) | |
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41) | |
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41) | |
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41) | |
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42) | |
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43) | |
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43) | |
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44) | |
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45) | |
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45) | |
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45) | |
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45) | |
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46) | |
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46) | |
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47) | |
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48) | |
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49) | |
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49) | |
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50) | |
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50) | |
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51) | |
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52) | |
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53) | |
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54) | |
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55) | |
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55) | |
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55) | |
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55) | |
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56) | |
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc58) | |
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58) | |
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} { | |
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ | |
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)): | |
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63) | |
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65) | |
} loc(#loc62) | |
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} { | |
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67) | |
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68) | |
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69) | |
%3:2 = scf.if %2 -> (i1, i1) { | |
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71) | |
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72) | |
%true = arith.constant true loc(#loc73) | |
%11 = arith.xori %10, %true : i1 loc(#loc73) | |
%12 = arith.andi %9, %11 : i1 loc(#loc74) | |
%13 = arith.ori %0, %12 : i1 loc(#loc75) | |
%14 = arith.andi %9, %10 : i1 loc(#loc76) | |
%15 = arith.ori %1, %14 : i1 loc(#loc77) | |
scf.yield %13, %15 : i1, i1 loc(#loc77) | |
} else { | |
scf.yield %0, %1 : i1, i1 loc(#loc64) | |
} loc(#loc70) | |
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78) | |
%5 = arith.andi %3#1, %4 : i1 loc(#loc79) | |
%6 = arith.ori %3#0, %5 : i1 loc(#loc80) | |
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81) | |
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82) | |
tt.return %7, %8 : f64, i32 loc(#loc83) | |
} loc(#loc66) | |
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} { | |
%0 = tt.call @promote_to_tensor__fp64__(%arg0) : (f64) -> tensor<1xf64> loc(#loc85) | |
%true = arith.constant true loc(#loc86) | |
tt.return %true : i1 loc(#loc86) | |
} loc(#loc84) | |
tt.func private @promote_to_tensor__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)) -> tensor<1xf64> attributes {noinline = false} { | |
%cst = arith.constant dense<0.000000e+00> : tensor<1xf64> loc(#loc88) | |
%0 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc88) | |
%1 = arith.addf %0, %cst : tensor<1xf64> loc(#loc88) | |
tt.return %1 : tensor<1xf64> loc(#loc89) | |
} loc(#loc87) | |
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc90) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38) | |
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11) | |
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19) | |
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7) | |
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11) | |
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29) | |
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11) | |
#loc88 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15) | |
#loc89 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:11) | |
#loc90 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @is_floating__fp64__) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc64 = loc(unknown) | |
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
module { | |
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%c1_i32 = arith.constant 1 : i32 loc(#loc1) | |
%c32_i32 = arith.constant 32 : i32 loc(#loc2) | |
%0 = tt.get_program_id x : i32 loc(#loc3) | |
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4) | |
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4) | |
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5) | |
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6) | |
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7) | |
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7) | |
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8) | |
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8) | |
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9) | |
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10) | |
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11) | |
%c2_i32 = arith.constant 2 : i32 loc(#loc12) | |
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12) | |
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12) | |
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13) | |
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13) | |
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14) | |
%c1_i64 = arith.constant 1 : i64 loc(#loc15) | |
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15) | |
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16) | |
%c3_i32 = arith.constant 3 : i32 loc(#loc17) | |
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17) | |
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17) | |
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18) | |
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18) | |
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18) | |
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19) | |
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19) | |
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19) | |
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20) | |
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20) | |
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20) | |
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21) | |
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21) | |
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21) | |
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22) | |
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23) | |
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23) | |
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24) | |
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24) | |
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24) | |
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25) | |
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25) | |
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26) | |
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26) | |
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27) | |
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28) | |
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28) | |
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29) | |
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30) | |
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30) | |
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30) | |
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30) | |
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31) | |
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31) | |
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33) | |
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33) | |
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33) | |
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34) | |
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34) | |
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34) | |
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35) | |
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35) | |
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35) | |
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36) | |
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37) | |
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37) | |
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38) | |
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38) | |
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38) | |
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39) | |
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39) | |
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39) | |
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40) | |
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40) | |
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40) | |
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41) | |
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41) | |
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41) | |
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42) | |
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43) | |
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43) | |
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44) | |
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45) | |
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45) | |
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45) | |
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45) | |
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46) | |
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46) | |
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47) | |
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48) | |
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49) | |
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49) | |
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50) | |
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50) | |
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51) | |
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52) | |
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53) | |
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54) | |
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55) | |
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55) | |
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55) | |
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55) | |
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56) | |
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc58) | |
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58) | |
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} { | |
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ | |
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)): | |
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63) | |
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65) | |
} loc(#loc62) | |
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} { | |
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67) | |
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68) | |
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69) | |
%3:2 = scf.if %2 -> (i1, i1) { | |
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71) | |
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72) | |
%true = arith.constant true loc(#loc73) | |
%11 = arith.xori %10, %true : i1 loc(#loc73) | |
%12 = arith.andi %9, %11 : i1 loc(#loc74) | |
%13 = arith.ori %0, %12 : i1 loc(#loc75) | |
%14 = arith.andi %9, %10 : i1 loc(#loc76) | |
%15 = arith.ori %1, %14 : i1 loc(#loc77) | |
scf.yield %13, %15 : i1, i1 loc(#loc77) | |
} else { | |
scf.yield %0, %1 : i1, i1 loc(#loc64) | |
} loc(#loc70) | |
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78) | |
%5 = arith.andi %3#1, %4 : i1 loc(#loc79) | |
%6 = arith.ori %3#0, %5 : i1 loc(#loc80) | |
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81) | |
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82) | |
tt.return %7, %8 : f64, i32 loc(#loc83) | |
} loc(#loc66) | |
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} { | |
%true = arith.constant true loc(#loc85) | |
%cst = arith.constant dense<0.000000e+00> : tensor<1xf64> loc(#loc90) | |
%0 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc90) | |
%1 = arith.addf %0, %cst : tensor<1xf64> loc(#loc90) | |
tt.return %true : i1 loc(#loc85) | |
} loc(#loc84) | |
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc88) | |
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc89) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38) | |
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11) | |
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19) | |
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7) | |
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11) | |
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11) | |
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15) | |
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29) | |
#loc88 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
#loc89 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
#loc90 = loc(callsite(#loc86 at #loc87)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @maximum_with_index__fp64_i32_fp64_i32__) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc64 = loc(unknown) | |
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
module { | |
// Public kernel entry point.
//   %arg0: pointer to f64 input data (read via two masked loads).
//   %arg1: pointer to i64 output (one element is stored at offset 0).
//   %arg2, %arg3: i32 arguments that are not referenced in the body.
// For each of 32 lanes it loads up to two f64 values, averages the ones whose
// mask is set, then reduces across the 32 lanes to the index of the maximum and
// stores that index as i64.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%c1_i32 = arith.constant 1 : i32 loc(#loc1) | |
%c32_i32 = arith.constant 32 : i32 loc(#loc2) | |
// Outer index: program_id(x) * 1 plus a length-1 range, shaped 1x1 (%5), and its `< 1` mask (%6).
%0 = tt.get_program_id x : i32 loc(#loc3) | |
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4) | |
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4) | |
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5) | |
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6) | |
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7) | |
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7) | |
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8) | |
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8) | |
// Lane index 0..31 shaped 1x32 (%8) and its `< 32` mask (%9).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9) | |
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10) | |
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11) | |
// Split the lane index: %10 = lane % 2, %11 = lane / 2.
%c2_i32 = arith.constant 2 : i32 loc(#loc12) | |
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12) | |
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12) | |
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13) | |
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13) | |
// %12 = (0 < 1) on constant i64 tensors.
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14) | |
%c1_i64 = arith.constant 1 : i64 loc(#loc15) | |
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15) | |
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16) | |
// First element: start %14 = (%10 * 3) / 2, bound %17 = (%10 * 3) / 2 + 2,
// validity mask %20 = %12 and (%14 < %17).
%c3_i32 = arith.constant 3 : i32 loc(#loc17) | |
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17) | |
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17) | |
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18) | |
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18) | |
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18) | |
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19) | |
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19) | |
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19) | |
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20) | |
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20) | |
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20) | |
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21) | |
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21) | |
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21) | |
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22) | |
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23) | |
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23) | |
// First load: offset %24 = %11 * 3 + (%10 * 3) / 2 from %arg0, masked by %9 and %20,
// with 0.0 as the masked-off value; %30 zeroes the lanes where %20 is false.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24) | |
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24) | |
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24) | |
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25) | |
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25) | |
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26) | |
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26) | |
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27) | |
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28) | |
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28) | |
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29) | |
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30) | |
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30) | |
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30) | |
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30) | |
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31) | |
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31) | |
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
// Second element: position %33 = (%10 * 3) / 2 + 1 checked against the same bound %17,
// validity mask %36 = %12 and (%33 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33) | |
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33) | |
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33) | |
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34) | |
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34) | |
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34) | |
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35) | |
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35) | |
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35) | |
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36) | |
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37) | |
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37) | |
// Second load: offset %41 = %11 * 3 + 1 + (%10 * 3) / 2 from %arg0, masked by %9 and %36,
// with 0.0 as the masked-off value; %47 zeroes the lanes where %36 is false.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38) | |
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38) | |
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38) | |
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39) | |
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39) | |
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39) | |
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40) | |
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40) | |
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40) | |
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41) | |
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41) | |
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41) | |
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42) | |
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43) | |
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43) | |
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44) | |
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45) | |
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45) | |
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45) | |
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45) | |
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46) | |
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46) | |
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47) | |
// %48 = sum of the two (masked) values.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48) | |
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49) | |
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49) | |
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50) | |
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50) | |
// %51 = number of valid elements (1.0 per set mask); %52 = sum / count.
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51) | |
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52) | |
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53) | |
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54) | |
// 0xFF800000 is the f32 bit pattern of -infinity; lanes outside the %9 mask get -inf
// before the max reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55) | |
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55) | |
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55) | |
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55) | |
// Reduce (value, lane index) pairs; only the index result %55#1 is used below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56) | |
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57) | |
// Store the index, sign-extended to i64, at %arg1 + 0 (the store carries no mask operand).
%c0_i32 = arith.constant 0 : i32 loc(#loc58) | |
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58) | |
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
// Reduces a 1x32 tensor of f64 values (%arg0) and a 1x32 tensor of i32 indices (%arg1)
// along axis 1, returning the reduced value and index as tensor<1xf64> / tensor<1xi32>.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} { | |
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ | |
// Combine region: takes two (value, index) pairs and delegates the choice
// to @maximum_with_index__fp64_i32_fp64_i32__.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)): | |
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63) | |
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65) | |
} loc(#loc62) | |
// Scalar combine step over two (value, index) pairs: (%arg0, %arg1) and (%arg2, %arg3).
// Returns the first pair when its value is greater, or when the values tie and
// %arg1 < %arg3; otherwise returns the second pair. Inside the is_floating branch
// a NaN is treated as greater than any non-NaN, and two NaNs count as a tie.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} { | |
// Ordered comparisons: both are false if either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67) | |
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68) | |
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69) | |
// %3#0 = "first value is greater", %3#1 = "values are equal", NaN-adjusted when %2 is true.
%3:2 = scf.if %2 -> (i1, i1) { | |
// `une x, x` is true only when x is NaN.
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71) | |
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72) | |
%true = arith.constant true loc(#loc73) | |
// %11 = not(%10); %12 = %arg0 is NaN and %arg2 is not.
%11 = arith.xori %10, %true : i1 loc(#loc73) | |
%12 = arith.andi %9, %11 : i1 loc(#loc74) | |
%13 = arith.ori %0, %12 : i1 loc(#loc75) | |
// %14 = both operands are NaN, which is folded into the equality flag.
%14 = arith.andi %9, %10 : i1 loc(#loc76) | |
%15 = arith.ori %1, %14 : i1 loc(#loc77) | |
scf.yield %13, %15 : i1, i1 loc(#loc77) | |
} else { | |
// Without the NaN handling the plain ordered comparisons are used as-is.
scf.yield %0, %1 : i1, i1 loc(#loc64) | |
} loc(#loc70) | |
// Tie-break on the index: signed %arg1 < %arg3.
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78) | |
%5 = arith.andi %3#1, %4 : i1 loc(#loc79) | |
// %6 = pick the first pair.
%6 = arith.ori %3#0, %5 : i1 loc(#loc80) | |
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81) | |
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82) | |
tt.return %7, %8 : f64, i32 loc(#loc83) | |
} loc(#loc66) | |
// Specialization for an f64 argument: ignores %arg0 and always returns the constant `true`.
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} { | |
%true = arith.constant true loc(#loc85) | |
tt.return %true : i1 loc(#loc85) | |
} loc(#loc84) | |
// Body-less declaration: private function taking an f64 and returning tensor<1xf64>.
// No tt.call in this module dump targets it.
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc86) | |
// Body-less declaration: private function with no arguments returning tensor<1xi1>.
// No tt.call in this module dump targets it.
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc87) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38) | |
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11) | |
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19) | |
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7) | |
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11) | |
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11) | |
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
#loc87 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @maximum_with_index__fp64_i32_fp64_i32__) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc64 = loc(unknown) | |
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
module { | |
// Public kernel entry point.
// Reading the ops below: for each of 32 lane indices it loads up to two f64
// values through %arg0, divides their masked sum by the number of valid
// contributions, finds the lane index of the maximum via
// @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_, and stores that index
// as i64 through %arg1.
// %arg2 and %arg3 are not referenced anywhere in this body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%c1_i32 = arith.constant 1 : i32 loc(#loc1) | |
%c32_i32 = arith.constant 32 : i32 loc(#loc2) | |
// Program-level index: program_id(x) * 1 plus the range [0, 1), shaped 1x1.
// %6 (index < 1) is computed here but not referenced again in this function.
%0 = tt.get_program_id x : i32 loc(#loc3) | |
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4) | |
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4) | |
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5) | |
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6) | |
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7) | |
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7) | |
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8) | |
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8) | |
// Lane indices 0..31 as a 1x32 tensor (%8) and the bounds mask %9 (index < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9) | |
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10) | |
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11) | |
// Split each lane index: %10 = index rem 2, %11 = index div 2 (both signed).
%c2_i32 = arith.constant 2 : i32 loc(#loc12) | |
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12) | |
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12) | |
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13) | |
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13) | |
// %12 compares the constants 0 and 1 (0 < 1), i.e. a constant-true 1x1 mask.
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14) | |
%c1_i64 = arith.constant 1 : i64 loc(#loc15) | |
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15) | |
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16) | |
// First validity mask %20: (rem*3)/2 < (rem*3)/2 + 2, AND-ed with broadcast %12.
// %17 = (rem*3)/2 + 2 is reused as the upper bound for the second mask below.
%c3_i32 = arith.constant 3 : i32 loc(#loc17) | |
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17) | |
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17) | |
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18) | |
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18) | |
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18) | |
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19) | |
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19) | |
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19) | |
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20) | |
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20) | |
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20) | |
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21) | |
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21) | |
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21) | |
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22) | |
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23) | |
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23) | |
// First load: element offset (quot*3) + (rem*3)/2 from %arg0, masked by
// %9 & %20 with 0.0 as the fallback; %30 zeroes lanes where %20 is false.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24) | |
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24) | |
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24) | |
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25) | |
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25) | |
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26) | |
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26) | |
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27) | |
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28) | |
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28) | |
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29) | |
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30) | |
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30) | |
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30) | |
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30) | |
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31) | |
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31) | |
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
// Second validity mask %36: (rem*3)/2 + 1 < %17, AND-ed with broadcast %12.
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33) | |
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33) | |
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33) | |
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34) | |
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34) | |
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34) | |
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35) | |
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35) | |
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35) | |
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36) | |
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37) | |
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37) | |
// Second load: element offset (quot*3) + 1 + (rem*3)/2 from %arg0, masked by
// %9 & %36 with 0.0 as the fallback; %47 zeroes lanes where %36 is false.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38) | |
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38) | |
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38) | |
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39) | |
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39) | |
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39) | |
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40) | |
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40) | |
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40) | |
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41) | |
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41) | |
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41) | |
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42) | |
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43) | |
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43) | |
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44) | |
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45) | |
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45) | |
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45) | |
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45) | |
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46) | |
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46) | |
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47) | |
// %48 = sum of the two masked values.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48) | |
// Per-lane count of valid contributions (1.0 for each true mask, else 0.0),
// then %52 = sum / count.
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49) | |
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49) | |
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50) | |
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50) | |
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51) | |
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52) | |
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53) | |
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54) | |
// 0xFF800000 is the f32 bit pattern of -infinity; lanes where %9 is false are
// replaced with -inf (extended to f64) before the reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55) | |
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55) | |
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55) | |
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55) | |
// Reduce (value, lane index) pairs; only the index result %55#1 is used below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56) | |
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57) | |
// Store the index, sign-extended to i64, at %arg1 + 0. The store has no mask operand.
%c0_i32 = arith.constant 0 : i32 loc(#loc58) | |
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58) | |
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
// Reduces a 1x32 tensor of f64 values (%arg0) together with a parallel 1x32
// tensor of i32 indices (%arg1) along axis 1. The combine region forwards each
// pair of (value, index) operands to @maximum_with_index__fp64_i32_fp64_i32__.
// Returns the reduced value and its index as 1-element tensors.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} { | |
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ | |
// Block arguments: (%arg2, %arg3) = first (value, index), (%arg4, %arg5) = second.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)): | |
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63) | |
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65) | |
} loc(#loc62) | |
// Pairwise combiner over (value, index) pairs: (%arg0, %arg1) vs (%arg2, %arg3).
// Returns the first pair when its value is greater, when it alone is NaN, or
// when the values are equal (or both NaN) and its index is smaller; otherwise
// returns the second pair.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} { | |
%true = arith.constant true loc(#loc67) | |
// Ordered comparisons: both are false if either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc68) | |
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc69) | |
// The condition is a constant true, so only the "then" region is ever selected.
%true_0 = arith.constant true loc(#loc88) | |
%2:2 = scf.if %true_0 -> (i1, i1) { | |
// "une" of a value against itself is true only when that value is NaN.
%8 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc73) | |
%9 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc74) | |
// xor with true negates: %10 = second value is not NaN.
%10 = arith.xori %9, %true : i1 loc(#loc67) | |
// %12 = greater, or first is NaN while second is not.
%11 = arith.andi %8, %10 : i1 loc(#loc75) | |
%12 = arith.ori %0, %11 : i1 loc(#loc76) | |
// %14 = equal, or both are NaN.
%13 = arith.andi %8, %9 : i1 loc(#loc77) | |
%14 = arith.ori %1, %13 : i1 loc(#loc78) | |
scf.yield %12, %14 : i1, i1 loc(#loc78) | |
} else { | |
scf.yield %0, %1 : i1, i1 loc(#loc64) | |
} loc(#loc72) | |
// Tie-break: on "equal", prefer the pair with the smaller (signed) index.
%3 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc79) | |
%4 = arith.andi %2#1, %3 : i1 loc(#loc80) | |
%5 = arith.ori %2#0, %4 : i1 loc(#loc81) | |
// The same predicate selects both the value and its index.
%6 = arith.select %5, %arg0, %arg2 : f64 loc(#loc82) | |
%7 = arith.select %5, %arg1, %arg3 : i32 loc(#loc83) | |
tt.return %6, %7 : f64, i32 loc(#loc84) | |
} loc(#loc66) | |
// Body-less private function declarations (signature only). No tt.call to any
// of these three symbols appears in the function bodies of this module dump.
tt.func private @is_floating__fp64__(f64) -> i1 attributes {noinline = false} loc(#loc85) | |
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc86) | |
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc87) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38) | |
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11) | |
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11) | |
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19) | |
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7) | |
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11) | |
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
#loc87 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
#loc88 = loc(callsite(#loc70 at #loc71)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc64 = loc(unknown) | |
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
module { | |
// Public kernel entry point.
// Reading the ops below: for each of 32 lane indices it loads up to two f64
// values through %arg0, divides their masked sum by the number of valid
// contributions, finds the lane index of the maximum via
// @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_, and stores that index
// as i64 through %arg1.
// %arg2 and %arg3 are not referenced anywhere in this body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%c1_i32 = arith.constant 1 : i32 loc(#loc1) | |
%c32_i32 = arith.constant 32 : i32 loc(#loc2) | |
// Program-level index: program_id(x) * 1 plus the range [0, 1), shaped 1x1.
// %6 (index < 1) is computed here but not referenced again in this function.
%0 = tt.get_program_id x : i32 loc(#loc3) | |
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4) | |
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4) | |
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5) | |
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6) | |
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7) | |
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7) | |
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8) | |
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8) | |
// Lane indices 0..31 as a 1x32 tensor (%8) and the bounds mask %9 (index < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9) | |
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10) | |
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11) | |
// Split each lane index: %10 = index rem 2, %11 = index div 2 (both signed).
%c2_i32 = arith.constant 2 : i32 loc(#loc12) | |
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12) | |
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12) | |
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13) | |
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13) | |
// %12 compares the constants 0 and 1 (0 < 1), i.e. a constant-true 1x1 mask.
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14) | |
%c1_i64 = arith.constant 1 : i64 loc(#loc15) | |
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15) | |
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16) | |
// First validity mask %20: (rem*3)/2 < (rem*3)/2 + 2, AND-ed with broadcast %12.
// %17 = (rem*3)/2 + 2 is reused as the upper bound for the second mask below.
%c3_i32 = arith.constant 3 : i32 loc(#loc17) | |
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17) | |
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17) | |
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18) | |
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18) | |
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18) | |
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19) | |
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19) | |
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19) | |
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20) | |
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20) | |
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20) | |
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21) | |
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21) | |
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21) | |
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22) | |
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23) | |
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23) | |
// First load: element offset (quot*3) + (rem*3)/2 from %arg0, masked by
// %9 & %20 with 0.0 as the fallback; %30 zeroes lanes where %20 is false.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24) | |
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24) | |
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24) | |
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25) | |
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25) | |
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26) | |
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26) | |
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27) | |
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28) | |
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28) | |
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29) | |
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30) | |
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30) | |
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30) | |
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30) | |
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31) | |
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31) | |
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
// Second validity mask %36: (rem*3)/2 + 1 < %17, AND-ed with broadcast %12.
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33) | |
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33) | |
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33) | |
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34) | |
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34) | |
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34) | |
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35) | |
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35) | |
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35) | |
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36) | |
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37) | |
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37) | |
// Second load: element offset (quot*3) + 1 + (rem*3)/2 from %arg0, masked by
// %9 & %36 with 0.0 as the fallback; %47 zeroes lanes where %36 is false.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38) | |
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38) | |
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38) | |
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39) | |
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39) | |
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39) | |
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40) | |
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40) | |
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40) | |
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41) | |
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41) | |
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41) | |
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42) | |
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43) | |
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43) | |
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44) | |
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45) | |
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45) | |
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45) | |
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45) | |
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46) | |
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46) | |
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47) | |
// %48 = sum of the two masked values.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48) | |
// Per-lane count of valid contributions (1.0 for each true mask, else 0.0),
// then %52 = sum / count.
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49) | |
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49) | |
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50) | |
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50) | |
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51) | |
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52) | |
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53) | |
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54) | |
// 0xFF800000 is the f32 bit pattern of -infinity; lanes where %9 is false are
// replaced with -inf (extended to f64) before the reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55) | |
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55) | |
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55) | |
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55) | |
// Reduce (value, lane index) pairs; only the index result %55#1 is used below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56) | |
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57) | |
// Store the index, sign-extended to i64, at %arg1 + 0. The store has no mask operand.
%c0_i32 = arith.constant 0 : i32 loc(#loc58) | |
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58) | |
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
// Reduces a 1x32 tensor of f64 values (%arg0) together with a parallel 1x32
// tensor of i32 indices (%arg1) along axis 1. The combine region forwards each
// pair of (value, index) operands to @maximum_with_index__fp64_i32_fp64_i32__.
// Returns the reduced value and its index as 1-element tensors.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} { | |
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ | |
// Block arguments: (%arg2, %arg3) = first (value, index), (%arg4, %arg5) = second.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)): | |
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63) | |
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65) | |
} loc(#loc62) | |
// Pairwise combiner over (value, index) pairs: (%arg0, %arg1) vs (%arg2, %arg3).
// Returns the first pair when its value is greater, when it alone is NaN, or
// when the values are equal (or both NaN) and its index is smaller; otherwise
// returns the second pair. This version is straight-line code with no scf.if.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} { | |
%true = arith.constant true loc(#loc67) | |
// Ordered comparisons: both are false if either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc68) | |
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc69) | |
// "une" of a value against itself is true only when that value is NaN.
%2 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc70) | |
%3 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc71) | |
// xor with true negates: %4 = second value is not NaN.
%4 = arith.xori %3, %true : i1 loc(#loc67) | |
// %6 = greater, or first is NaN while second is not.
%5 = arith.andi %2, %4 : i1 loc(#loc72) | |
%6 = arith.ori %0, %5 : i1 loc(#loc73) | |
// %8 = equal, or both are NaN.
%7 = arith.andi %2, %3 : i1 loc(#loc74) | |
%8 = arith.ori %1, %7 : i1 loc(#loc75) | |
// Tie-break: on "equal", prefer the pair with the smaller (signed) index.
%9 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc76) | |
%10 = arith.andi %8, %9 : i1 loc(#loc77) | |
%11 = arith.ori %6, %10 : i1 loc(#loc78) | |
// The same predicate selects both the value and its index.
%12 = arith.select %11, %arg0, %arg2 : f64 loc(#loc79) | |
%13 = arith.select %11, %arg1, %arg3 : i32 loc(#loc80) | |
tt.return %12, %13 : f64, i32 loc(#loc81) | |
} loc(#loc66) | |
// Body-less private function declarations (signature only). No tt.call to any
// of these three symbols appears in the function bodies of this module dump.
tt.func private @is_floating__fp64__(f64) -> i1 attributes {noinline = false} loc(#loc82) | |
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc83) | |
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc84) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38) | |
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11) | |
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11) | |
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
#loc84 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc64 = loc(unknown) | |
module { | |
// Public kernel entry point. Reads f64 values through %arg0, builds a 1x32
// tensor from two masked loads (sum divided by the number of enabled masks),
// reduces it with @max_with_index... and stores the winning index, sign-extended
// to i64, through %arg1.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%c1_i32 = arith.constant 1 : i32 loc(#loc1) | |
%c32_i32 = arith.constant 32 : i32 loc(#loc2) | |
// Row index: program_id(x) * 1 + [0], shaped 1x1. The resulting mask %6
// (row index < 1) is not referenced by any later op in this function.
%0 = tt.get_program_id x : i32 loc(#loc3) | |
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4) | |
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4) | |
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5) | |
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6) | |
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7) | |
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7) | |
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8) | |
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8) | |
// Reduction index %8 = [0, 32) shaped 1x32, with mask %9 = (%8 < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9) | |
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10) | |
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11) | |
// Split the reduction index: %10 = %8 % 2, %11 = %8 / 2 (signed).
%c2_i32 = arith.constant 2 : i32 loc(#loc12) | |
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12) | |
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12) | |
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13) | |
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13) | |
// %12 compares two constant tensors (0 < 1).
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14) | |
%c1_i64 = arith.constant 1 : i64 loc(#loc15) | |
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15) | |
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16) | |
// First mask %20: broadcast(%12) && ((%10 * 3) / 2 < (%10 * 3) / 2 + 2).
// %17 (the "+ 2" bound) is reused by the second mask below.
%c3_i32 = arith.constant 3 : i32 loc(#loc17) | |
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17) | |
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17) | |
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18) | |
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18) | |
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18) | |
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19) | |
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19) | |
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19) | |
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20) | |
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20) | |
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20) | |
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21) | |
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21) | |
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21) | |
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22) | |
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23) | |
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23) | |
// First load: element offset %24 = %11 * 3 + (%10 * 3) / 2 from %arg0,
// masked by %9 && %20 with 0.0 as the fill value; %30 zeroes lanes where %20 is false.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24) | |
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24) | |
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24) | |
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25) | |
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25) | |
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26) | |
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26) | |
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27) | |
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28) | |
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28) | |
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29) | |
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30) | |
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30) | |
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30) | |
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30) | |
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31) | |
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31) | |
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
// Second mask %36: broadcast(%12) && ((%10 * 3) / 2 + 1 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33) | |
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33) | |
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33) | |
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34) | |
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34) | |
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34) | |
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35) | |
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35) | |
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35) | |
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36) | |
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37) | |
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37) | |
// Second load: element offset %41 = %11 * 3 + 1 + (%10 * 3) / 2 from %arg0,
// masked by %9 && %36 with 0.0 as the fill value; %47 zeroes lanes where %36 is false.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38) | |
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38) | |
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38) | |
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39) | |
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39) | |
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39) | |
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40) | |
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40) | |
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40) | |
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41) | |
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41) | |
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41) | |
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42) | |
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43) | |
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43) | |
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44) | |
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45) | |
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45) | |
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45) | |
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45) | |
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46) | |
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46) | |
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47) | |
// %48 = sum of the two masked values; %51 = number of enabled masks per lane
// (each mask contributes 1.0 or 0.0); %52 = %48 / %51.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48) | |
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49) | |
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49) | |
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50) | |
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50) | |
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51) | |
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52) | |
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53) | |
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54) | |
// 0xFF800000 is the f32 bit pattern of -infinity; lanes outside mask %9 are
// replaced with it before the max reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55) | |
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55) | |
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55) | |
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55) | |
// Reduce (value, index) pairs; only the index result %55#1 is used below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56) | |
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57) | |
// Store the index, sign-extended to i64, at %arg1 + 0. The store has no mask operand.
%c0_i32 = arith.constant 0 : i32 loc(#loc58) | |
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58) | |
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
// Reduces a (values, indices) pair of 1x32 tensors along axis 1 and returns
// the winning value and its index as 1-element tensors. The combine region
// holds the NaN-aware comparison directly; its ops carry callsite locations
// (#loc84..#loc97), which suggests the scalar combiner was inlined here.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} { | |
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ | |
// Region arguments: (%arg2, %arg3) is one (value, index) pair and
// (%arg4, %arg5) is the other.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)): | |
%true = arith.constant true loc(#loc84) | |
// %1: ordered a > b; %2: ordered a == b (both false if either value is NaN).
%1 = arith.cmpf ogt, %arg2, %arg4 : f64 loc(#loc85) | |
%2 = arith.cmpf oeq, %arg2, %arg4 : f64 loc(#loc86) | |
// Self-comparison with `une` is true only for NaN: %3 = isnan(a), %4 = isnan(b).
%3 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc87) | |
%4 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc88) | |
// %6 = isnan(a) && !isnan(b); %7 = a wins outright.
%5 = arith.xori %4, %true : i1 loc(#loc84) | |
%6 = arith.andi %3, %5 : i1 loc(#loc89) | |
%7 = arith.ori %1, %6 : i1 loc(#loc90) | |
// %9 = tie (a == b, or both NaN); ties go to the smaller signed index.
%8 = arith.andi %3, %4 : i1 loc(#loc91) | |
%9 = arith.ori %2, %8 : i1 loc(#loc92) | |
%10 = arith.cmpi slt, %arg3, %arg5 : i32 loc(#loc93) | |
%11 = arith.andi %9, %10 : i1 loc(#loc94) | |
%12 = arith.ori %7, %11 : i1 loc(#loc95) | |
// Select the winning value and its index with the same predicate.
%13 = arith.select %12, %arg2, %arg4 : f64 loc(#loc96) | |
%14 = arith.select %12, %arg3, %arg5 : i32 loc(#loc97) | |
tt.reduce.return %13, %14 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc79) | |
} loc(#loc62) | |
// Private function declarations: in this dump these four symbols appear with
// a signature only (no body region).
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(f64, i32, f64, i32) -> (f64, i32) attributes {noinline = false} loc(#loc80) | |
tt.func private @is_floating__fp64__(f64) -> i1 attributes {noinline = false} loc(#loc81) | |
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc82) | |
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc83) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38) | |
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11) | |
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
#loc83 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
#loc84 = loc(callsite(#loc65 at #loc63)) | |
#loc85 = loc(callsite(#loc66 at #loc63)) | |
#loc86 = loc(callsite(#loc67 at #loc63)) | |
#loc87 = loc(callsite(#loc68 at #loc63)) | |
#loc88 = loc(callsite(#loc69 at #loc63)) | |
#loc89 = loc(callsite(#loc70 at #loc63)) | |
#loc90 = loc(callsite(#loc71 at #loc63)) | |
#loc91 = loc(callsite(#loc72 at #loc63)) | |
#loc92 = loc(callsite(#loc73 at #loc63)) | |
#loc93 = loc(callsite(#loc74 at #loc63)) | |
#loc94 = loc(callsite(#loc75 at #loc63)) | |
#loc95 = loc(callsite(#loc76 at #loc63)) | |
#loc96 = loc(callsite(#loc77 at #loc63)) | |
#loc97 = loc(callsite(#loc78 at #loc63)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @triton__0d1d23de) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc65 = loc(unknown) | |
module { | |
// Public kernel @triton__0d1d23de as dumped while the max-with-index helper is still a tt.call.
// Arguments: %arg0 is an f64 pointer that is read, %arg1 is an i64 pointer that is written;
// %arg2 and %arg3 (i32) are not referenced anywhere in this body.
// Several scalar constants (e.g. %c2_i32, %c3_i32) have no uses here; the dense tensor
// constants defined beside them are what the arithmetic consumes.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
// Program-id based 1x1 index: %5 = splat(program_id * 1) + range(0, 1); %6 = %5 < 1.
// %6 has no later use in this body.
%c1_i32 = arith.constant 1 : i32 loc(#loc1) | |
%c32_i32 = arith.constant 32 : i32 loc(#loc2) | |
%0 = tt.get_program_id x : i32 loc(#loc3) | |
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4) | |
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4) | |
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5) | |
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6) | |
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7) | |
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7) | |
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8) | |
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8) | |
// %8: indices 0..31 as a 1x32 tensor; %9: in-range mask (%8 < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9) | |
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10) | |
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11) | |
// %10 = %8 rem 2 and %11 = %8 div 2 (both signed).
%c2_i32 = arith.constant 2 : i32 loc(#loc12) | |
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12) | |
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12) | |
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13) | |
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13) | |
// %12 compares two constant 1x1 tensors (0 < 1) to form a 1x1 predicate.
%c0_i64 = arith.constant 0 : i64 loc(#loc14) | |
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14) | |
%c1_i64 = arith.constant 1 : i64 loc(#loc15) | |
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15) | |
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16) | |
// %14 = (%10 * 3) / 2; %17 = (%10 * 3) / 2 + 2; %18 = %14 < %17.
%c3_i32 = arith.constant 3 : i32 loc(#loc17) | |
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17) | |
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17) | |
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18) | |
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18) | |
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18) | |
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19) | |
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19) | |
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19) | |
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20) | |
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20) | |
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20) | |
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21) | |
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21) | |
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21) | |
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22) | |
// %20 = broadcast(%12) AND %18: predicate for the first load.
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23) | |
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23) | |
// First load: element offset %24 = %11 * 3 + (%10 * 3) / 2 from %arg0,
// mask %27 = %9 AND %20, masked-off value 0.0 (%28, extended from f32 to f64).
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24) | |
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24) | |
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24) | |
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25) | |
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25) | |
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26) | |
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26) | |
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27) | |
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28) | |
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28) | |
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29) | |
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30) | |
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30) | |
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30) | |
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30) | |
// %30 keeps the loaded value where %20 holds and substitutes 0.0 elsewhere.
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31) | |
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31) | |
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
// %33 = (%10 * 3) / 2 + 1; %34 = %33 < %17; %36 = broadcast(%12) AND %34:
// predicate for the second load.
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33) | |
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33) | |
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33) | |
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34) | |
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34) | |
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34) | |
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35) | |
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35) | |
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35) | |
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36) | |
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37) | |
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37) | |
// Second load: element offset %41 = (%11 * 3 + 1) + (%10 * 3) / 2 from %arg0,
// mask %44 = %9 AND %36, masked-off value 0.0 (%45).
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38) | |
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38) | |
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38) | |
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39) | |
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39) | |
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39) | |
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40) | |
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40) | |
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40) | |
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41) | |
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41) | |
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41) | |
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42) | |
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43) | |
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43) | |
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44) | |
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45) | |
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45) | |
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45) | |
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45) | |
// %47 keeps the second loaded value where %36 holds, else 0.0; %48 = %47 + %30.
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46) | |
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46) | |
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47) | |
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48) | |
// %49 / %50 map the two predicates to 1.0 or 0.0; %51 is their sum; %52 = %48 / %51.
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49) | |
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49) | |
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50) | |
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50) | |
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51) | |
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51) | |
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52) | |
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52) | |
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53) | |
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54) | |
// 0xFF800000 is the f32 bit pattern of -inf; lanes outside %9 are replaced by it
// before the reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55) | |
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55) | |
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55) | |
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55) | |
// Call the max-with-index helper on (values %54, indices %8); only the index
// result %55#1 is used below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56) | |
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57) | |
// Store the index, sign-extended to i64, at %arg1 + 0.
%c0_i32 = arith.constant 0 : i32 loc(#loc58) | |
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58) | |
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
// Private helper: reduces a (value, index) pair of 1x32 tensors along axis 1 and
// returns the winning value (tensor<1xf64>) and its index (tensor<1xi32>).
// The combine region picks the left pair (%arg2, %arg3) over the right pair
// (%arg4, %arg5) when any of the following holds:
//   - left value > right value (ordered compare),
//   - left value is NaN and right value is not,
//   - the values tie (ordered-equal, or both NaN) and the left index is smaller.
// Otherwise the right pair is kept.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} { | |
// %true is used inside the region to negate %4 via xori.
%true = arith.constant true loc(#loc84) | |
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({ | |
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)): | |
// %1: left > right; %2: left == right (both ordered, so false if either is NaN).
%1 = arith.cmpf ogt, %arg2, %arg4 : f64 loc(#loc85) | |
%2 = arith.cmpf oeq, %arg2, %arg4 : f64 loc(#loc86) | |
// x != x (unordered-or-not-equal against itself) is true only for NaN:
// %3 = left is NaN, %4 = right is NaN.
%3 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc87) | |
%4 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc88) | |
// %5 = NOT %4; %6 = left is NaN AND right is not; %7 = %1 OR %6.
%5 = arith.xori %4, %true : i1 loc(#loc84) | |
%6 = arith.andi %3, %5 : i1 loc(#loc89) | |
%7 = arith.ori %1, %6 : i1 loc(#loc90) | |
// %8 = both NaN; %9 = tie (equal OR both NaN).
%8 = arith.andi %3, %4 : i1 loc(#loc91) | |
%9 = arith.ori %2, %8 : i1 loc(#loc92) | |
// Tie-break on the smaller index: %11 = tie AND (left index < right index).
%10 = arith.cmpi slt, %arg3, %arg5 : i32 loc(#loc93) | |
%11 = arith.andi %9, %10 : i1 loc(#loc94) | |
// %12 is the final "take left" predicate, applied to both value and index.
%12 = arith.ori %7, %11 : i1 loc(#loc95) | |
%13 = arith.select %12, %arg2, %arg4 : f64 loc(#loc96) | |
%14 = arith.select %12, %arg3, %arg5 : i32 loc(#loc97) | |
tt.reduce.return %13, %14 : f64, i32 loc(#loc64) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc64) | |
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc79) | |
} loc(#loc62) | |
// Private function declarations that appear without bodies in this dump;
// only their signatures and source locations are recorded here.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(f64, i32, f64, i32) -> (f64, i32) attributes {noinline = false} loc(#loc80) | |
tt.func private @is_floating__fp64__(f64) -> i1 attributes {noinline = false} loc(#loc81) | |
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc82) | |
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc83) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38) | |
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc64 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11) | |
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
#loc83 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
#loc84 = loc(callsite(#loc63 at #loc64)) | |
#loc85 = loc(callsite(#loc66 at #loc64)) | |
#loc86 = loc(callsite(#loc67 at #loc64)) | |
#loc87 = loc(callsite(#loc68 at #loc64)) | |
#loc88 = loc(callsite(#loc69 at #loc64)) | |
#loc89 = loc(callsite(#loc70 at #loc64)) | |
#loc90 = loc(callsite(#loc71 at #loc64)) | |
#loc91 = loc(callsite(#loc72 at #loc64)) | |
#loc92 = loc(callsite(#loc73 at #loc64)) | |
#loc93 = loc(callsite(#loc74 at #loc64)) | |
#loc94 = loc(callsite(#loc75 at #loc64)) | |
#loc95 = loc(callsite(#loc76 at #loc64)) | |
#loc96 = loc(callsite(#loc77 at #loc64)) | |
#loc97 = loc(callsite(#loc78 at #loc64)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @triton__0d1d23de) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc44 = loc(unknown) | |
module { | |
// Public kernel @triton__0d1d23de as dumped with the (value, index) max reduction
// inlined into the body and the dense constants hoisted to the top.
// Arguments: %arg0 is an f64 pointer that is read, %arg1 is an i64 pointer that is written;
// %arg2 and %arg3 (i32) are not referenced anywhere in this body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
// Dense constants shared by the ops below. %cst_0 (0xFF800000) is the f32 bit
// pattern of -inf.
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc1) | |
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc2) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc3) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc4) | |
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc5) | |
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc6) | |
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc7) | |
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc8) | |
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc9) | |
// %1: indices 0..31 as a 1x32 tensor; %2: in-range mask (%1 < 32).
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc10) | |
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc11) | |
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc9) | |
// %3 = %1 rem 2 and %4 = %1 div 2 (both signed).
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc8) | |
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc12) | |
// %6 = (%3 * 3) / 2; %9 = (%3 * 3) / 2 + 2; %10 = %6 < %9 (predicate for the first load).
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc7) | |
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc13) | |
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc14) | |
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc15) | |
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc16) | |
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc17) | |
// First load: element offset %14 = %4 * 3 + (%3 * 3) / 2 from %arg0,
// mask %17 = %2 AND %10, masked-off value 0.0 (%18, extended from f32 to f64).
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc18) | |
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc19) | |
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc20) | |
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc21) | |
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc22) | |
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc22) | |
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc23) | |
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc6) | |
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc6) | |
// %20 keeps the loaded value where %10 holds and substitutes 0.0 elsewhere.
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc24) | |
// %23 = (%3 * 3) / 2 + 1; %24 = %23 < %9 (predicate for the second load).
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc25) | |
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc26) | |
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc4) | |
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc27) | |
// Second load: element offset %29 = (%4 * 3 + 1) + (%3 * 3) / 2 from %arg0,
// mask %32 = %2 AND %24, masked-off value 0.0 (%33).
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc28) | |
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc29) | |
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc30) | |
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc31) | |
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc32) | |
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc33) | |
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc33) | |
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc34) | |
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc35) | |
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc35) | |
// %35: second loaded value where %24 holds, else 0.0; %36 = %35 + %20.
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc36) | |
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc37) | |
// %37 / %38 map the two predicates to 1.0 or 0.0; %39 is their sum; %40 = %36 / %39.
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc3) | |
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc38) | |
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc39) | |
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc40) | |
// Lanes outside %2 are replaced by -inf (%41) before the reduction.
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc2) | |
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc2) | |
// Reduction over (values %42, indices %1) along axis 1. The region keeps the
// left pair (%arg4, %arg5) when its value is greater, when it is NaN and the
// right value is not, or on a tie (equal / both NaN) with the smaller index.
%true = arith.constant true loc(#loc82) | |
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
// %48: left > right; %49: left == right (ordered compares).
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc83) | |
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc84) | |
// x != x is true only for NaN: %50 = left is NaN, %51 = right is NaN.
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc85) | |
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc86) | |
// %52 = NOT %51; %53 = left NaN AND right not NaN; %54 = %48 OR %53.
%52 = arith.xori %51, %true : i1 loc(#loc82) | |
%53 = arith.andi %50, %52 : i1 loc(#loc87) | |
%54 = arith.ori %48, %53 : i1 loc(#loc88) | |
// %55 = both NaN; %56 = tie (equal OR both NaN).
%55 = arith.andi %50, %51 : i1 loc(#loc89) | |
%56 = arith.ori %49, %55 : i1 loc(#loc90) | |
// Tie-break on the smaller index, then combine into the "take left" predicate %59.
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc91) | |
%58 = arith.andi %56, %57 : i1 loc(#loc92) | |
%59 = arith.ori %54, %58 : i1 loc(#loc93) | |
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc94) | |
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc95) | |
tt.reduce.return %60, %61 : f64, i32 loc(#loc68) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc68) | |
// Only the index result %43#1 is used: it is sign-extended to i64 and stored at %arg1 + 0.
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58) | |
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
// Private function declarations that appear without bodies in this dump;
// only their signatures and source locations are recorded here.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} loc(#loc62) | |
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(f64, i32, f64, i32) -> (f64, i32) attributes {noinline = false} loc(#loc63) | |
tt.func private @is_floating__fp64__(f64) -> i1 attributes {noinline = false} loc(#loc64) | |
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc65) | |
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc66) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0) | |
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0) | |
#loc64 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0) | |
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0) | |
#loc66 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0) | |
#loc67 = loc(callsite(#loc41 at #loc42)) | |
#loc68 = loc(callsite(#loc42 at #loc43)) | |
#loc69 = loc(callsite(#loc45 at #loc42)) | |
#loc70 = loc(callsite(#loc46 at #loc42)) | |
#loc71 = loc(callsite(#loc47 at #loc42)) | |
#loc72 = loc(callsite(#loc48 at #loc42)) | |
#loc73 = loc(callsite(#loc49 at #loc42)) | |
#loc74 = loc(callsite(#loc50 at #loc42)) | |
#loc75 = loc(callsite(#loc51 at #loc42)) | |
#loc76 = loc(callsite(#loc52 at #loc42)) | |
#loc77 = loc(callsite(#loc53 at #loc42)) | |
#loc78 = loc(callsite(#loc54 at #loc42)) | |
#loc79 = loc(callsite(#loc55 at #loc42)) | |
#loc80 = loc(callsite(#loc56 at #loc42)) | |
#loc81 = loc(callsite(#loc57 at #loc42)) | |
#loc82 = loc(callsite(#loc67 at #loc43)) | |
#loc83 = loc(callsite(#loc69 at #loc43)) | |
#loc84 = loc(callsite(#loc70 at #loc43)) | |
#loc85 = loc(callsite(#loc71 at #loc43)) | |
#loc86 = loc(callsite(#loc72 at #loc43)) | |
#loc87 = loc(callsite(#loc73 at #loc43)) | |
#loc88 = loc(callsite(#loc74 at #loc43)) | |
#loc89 = loc(callsite(#loc75 at #loc43)) | |
#loc90 = loc(callsite(#loc76 at #loc43)) | |
#loc91 = loc(callsite(#loc77 at #loc43)) | |
#loc92 = loc(callsite(#loc78 at #loc43)) | |
#loc93 = loc(callsite(#loc79 at #loc43)) | |
#loc94 = loc(callsite(#loc80 at #loc43)) | |
#loc95 = loc(callsite(#loc81 at #loc43)) | |
// -----// IR Dump Before TritonRewriteTensorPointer (triton-rewrite-tensor-pointer) ('builtin.module' operation) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc44 = loc(unknown) | |
module { | |
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%true = arith.constant true loc(#loc77) | |
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4) | |
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7) | |
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8) | |
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9) | |
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10) | |
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11) | |
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12) | |
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13) | |
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14) | |
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12) | |
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11) | |
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15) | |
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10) | |
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16) | |
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17) | |
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18) | |
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19) | |
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20) | |
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21) | |
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22) | |
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23) | |
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24) | |
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25) | |
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25) | |
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26) | |
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9) | |
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9) | |
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27) | |
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28) | |
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29) | |
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7) | |
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30) | |
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31) | |
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32) | |
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33) | |
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34) | |
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35) | |
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36) | |
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36) | |
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37) | |
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38) | |
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38) | |
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39) | |
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40) | |
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6) | |
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41) | |
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42) | |
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43) | |
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5) | |
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5) | |
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78) | |
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79) | |
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80) | |
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81) | |
%52 = arith.xori %51, %true : i1 loc(#loc77) | |
%53 = arith.andi %50, %52 : i1 loc(#loc82) | |
%54 = arith.ori %48, %53 : i1 loc(#loc83) | |
%55 = arith.andi %50, %51 : i1 loc(#loc84) | |
%56 = arith.ori %49, %55 : i1 loc(#loc85) | |
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86) | |
%58 = arith.andi %56, %57 : i1 loc(#loc87) | |
%59 = arith.ori %54, %58 : i1 loc(#loc88) | |
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89) | |
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90) | |
tt.reduce.return %60, %61 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58) | |
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc62 = loc(callsite(#loc1 at #loc2)) | |
#loc63 = loc(callsite(#loc2 at #loc3)) | |
#loc64 = loc(callsite(#loc45 at #loc2)) | |
#loc65 = loc(callsite(#loc46 at #loc2)) | |
#loc66 = loc(callsite(#loc47 at #loc2)) | |
#loc67 = loc(callsite(#loc48 at #loc2)) | |
#loc68 = loc(callsite(#loc49 at #loc2)) | |
#loc69 = loc(callsite(#loc50 at #loc2)) | |
#loc70 = loc(callsite(#loc51 at #loc2)) | |
#loc71 = loc(callsite(#loc52 at #loc2)) | |
#loc72 = loc(callsite(#loc53 at #loc2)) | |
#loc73 = loc(callsite(#loc54 at #loc2)) | |
#loc74 = loc(callsite(#loc55 at #loc2)) | |
#loc75 = loc(callsite(#loc56 at #loc2)) | |
#loc76 = loc(callsite(#loc57 at #loc2)) | |
#loc77 = loc(callsite(#loc62 at #loc3)) | |
#loc78 = loc(callsite(#loc64 at #loc3)) | |
#loc79 = loc(callsite(#loc65 at #loc3)) | |
#loc80 = loc(callsite(#loc66 at #loc3)) | |
#loc81 = loc(callsite(#loc67 at #loc3)) | |
#loc82 = loc(callsite(#loc68 at #loc3)) | |
#loc83 = loc(callsite(#loc69 at #loc3)) | |
#loc84 = loc(callsite(#loc70 at #loc3)) | |
#loc85 = loc(callsite(#loc71 at #loc3)) | |
#loc86 = loc(callsite(#loc72 at #loc3)) | |
#loc87 = loc(callsite(#loc73 at #loc3)) | |
#loc88 = loc(callsite(#loc74 at #loc3)) | |
#loc89 = loc(callsite(#loc75 at #loc3)) | |
#loc90 = loc(callsite(#loc76 at #loc3)) | |
// -----// IR Dump Before Inliner (inline) ('builtin.module' operation) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc44 = loc(unknown) | |
module { | |
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%true = arith.constant true loc(#loc77) | |
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4) | |
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7) | |
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8) | |
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9) | |
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10) | |
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11) | |
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12) | |
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13) | |
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14) | |
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12) | |
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11) | |
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15) | |
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10) | |
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16) | |
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17) | |
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18) | |
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19) | |
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20) | |
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21) | |
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22) | |
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23) | |
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24) | |
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25) | |
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25) | |
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26) | |
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9) | |
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9) | |
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27) | |
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28) | |
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29) | |
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7) | |
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30) | |
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31) | |
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32) | |
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33) | |
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34) | |
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35) | |
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36) | |
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36) | |
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37) | |
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38) | |
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38) | |
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39) | |
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40) | |
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6) | |
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41) | |
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42) | |
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43) | |
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5) | |
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5) | |
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78) | |
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79) | |
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80) | |
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81) | |
%52 = arith.xori %51, %true : i1 loc(#loc77) | |
%53 = arith.andi %50, %52 : i1 loc(#loc82) | |
%54 = arith.ori %48, %53 : i1 loc(#loc83) | |
%55 = arith.andi %50, %51 : i1 loc(#loc84) | |
%56 = arith.ori %49, %55 : i1 loc(#loc85) | |
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86) | |
%58 = arith.andi %56, %57 : i1 loc(#loc87) | |
%59 = arith.ori %54, %58 : i1 loc(#loc88) | |
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89) | |
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90) | |
tt.reduce.return %60, %61 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58) | |
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc62 = loc(callsite(#loc1 at #loc2)) | |
#loc63 = loc(callsite(#loc2 at #loc3)) | |
#loc64 = loc(callsite(#loc45 at #loc2)) | |
#loc65 = loc(callsite(#loc46 at #loc2)) | |
#loc66 = loc(callsite(#loc47 at #loc2)) | |
#loc67 = loc(callsite(#loc48 at #loc2)) | |
#loc68 = loc(callsite(#loc49 at #loc2)) | |
#loc69 = loc(callsite(#loc50 at #loc2)) | |
#loc70 = loc(callsite(#loc51 at #loc2)) | |
#loc71 = loc(callsite(#loc52 at #loc2)) | |
#loc72 = loc(callsite(#loc53 at #loc2)) | |
#loc73 = loc(callsite(#loc54 at #loc2)) | |
#loc74 = loc(callsite(#loc55 at #loc2)) | |
#loc75 = loc(callsite(#loc56 at #loc2)) | |
#loc76 = loc(callsite(#loc57 at #loc2)) | |
#loc77 = loc(callsite(#loc62 at #loc3)) | |
#loc78 = loc(callsite(#loc64 at #loc3)) | |
#loc79 = loc(callsite(#loc65 at #loc3)) | |
#loc80 = loc(callsite(#loc66 at #loc3)) | |
#loc81 = loc(callsite(#loc67 at #loc3)) | |
#loc82 = loc(callsite(#loc68 at #loc3)) | |
#loc83 = loc(callsite(#loc69 at #loc3)) | |
#loc84 = loc(callsite(#loc70 at #loc3)) | |
#loc85 = loc(callsite(#loc71 at #loc3)) | |
#loc86 = loc(callsite(#loc72 at #loc3)) | |
#loc87 = loc(callsite(#loc73 at #loc3)) | |
#loc88 = loc(callsite(#loc74 at #loc3)) | |
#loc89 = loc(callsite(#loc75 at #loc3)) | |
#loc90 = loc(callsite(#loc76 at #loc3)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @triton__0d1d23de) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc44 = loc(unknown) | |
module { | |
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%true = arith.constant true loc(#loc77) | |
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4) | |
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7) | |
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8) | |
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9) | |
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10) | |
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11) | |
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12) | |
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13) | |
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14) | |
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12) | |
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11) | |
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15) | |
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10) | |
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16) | |
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17) | |
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18) | |
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19) | |
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20) | |
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21) | |
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22) | |
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23) | |
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24) | |
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25) | |
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25) | |
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26) | |
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9) | |
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9) | |
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27) | |
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28) | |
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29) | |
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7) | |
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30) | |
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31) | |
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32) | |
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33) | |
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34) | |
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35) | |
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36) | |
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36) | |
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37) | |
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38) | |
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38) | |
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39) | |
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40) | |
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6) | |
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41) | |
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42) | |
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43) | |
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5) | |
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5) | |
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78) | |
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79) | |
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80) | |
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81) | |
%52 = arith.xori %51, %true : i1 loc(#loc77) | |
%53 = arith.andi %50, %52 : i1 loc(#loc82) | |
%54 = arith.ori %48, %53 : i1 loc(#loc83) | |
%55 = arith.andi %50, %51 : i1 loc(#loc84) | |
%56 = arith.ori %49, %55 : i1 loc(#loc85) | |
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86) | |
%58 = arith.andi %56, %57 : i1 loc(#loc87) | |
%59 = arith.ori %54, %58 : i1 loc(#loc88) | |
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89) | |
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90) | |
tt.reduce.return %60, %61 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58) | |
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc62 = loc(callsite(#loc1 at #loc2)) | |
#loc63 = loc(callsite(#loc2 at #loc3)) | |
#loc64 = loc(callsite(#loc45 at #loc2)) | |
#loc65 = loc(callsite(#loc46 at #loc2)) | |
#loc66 = loc(callsite(#loc47 at #loc2)) | |
#loc67 = loc(callsite(#loc48 at #loc2)) | |
#loc68 = loc(callsite(#loc49 at #loc2)) | |
#loc69 = loc(callsite(#loc50 at #loc2)) | |
#loc70 = loc(callsite(#loc51 at #loc2)) | |
#loc71 = loc(callsite(#loc52 at #loc2)) | |
#loc72 = loc(callsite(#loc53 at #loc2)) | |
#loc73 = loc(callsite(#loc54 at #loc2)) | |
#loc74 = loc(callsite(#loc55 at #loc2)) | |
#loc75 = loc(callsite(#loc56 at #loc2)) | |
#loc76 = loc(callsite(#loc57 at #loc2)) | |
#loc77 = loc(callsite(#loc62 at #loc3)) | |
#loc78 = loc(callsite(#loc64 at #loc3)) | |
#loc79 = loc(callsite(#loc65 at #loc3)) | |
#loc80 = loc(callsite(#loc66 at #loc3)) | |
#loc81 = loc(callsite(#loc67 at #loc3)) | |
#loc82 = loc(callsite(#loc68 at #loc3)) | |
#loc83 = loc(callsite(#loc69 at #loc3)) | |
#loc84 = loc(callsite(#loc70 at #loc3)) | |
#loc85 = loc(callsite(#loc71 at #loc3)) | |
#loc86 = loc(callsite(#loc72 at #loc3)) | |
#loc87 = loc(callsite(#loc73 at #loc3)) | |
#loc88 = loc(callsite(#loc74 at #loc3)) | |
#loc89 = loc(callsite(#loc75 at #loc3)) | |
#loc90 = loc(callsite(#loc76 at #loc3)) | |
// -----// IR Dump Before TritonCombineOps (triton-combine) ('builtin.module' operation) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc44 = loc(unknown) | |
module { | |
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%true = arith.constant true loc(#loc77) | |
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4) | |
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7) | |
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8) | |
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9) | |
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10) | |
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11) | |
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12) | |
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13) | |
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14) | |
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12) | |
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11) | |
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15) | |
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10) | |
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16) | |
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17) | |
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18) | |
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19) | |
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20) | |
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21) | |
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22) | |
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23) | |
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24) | |
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25) | |
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25) | |
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26) | |
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9) | |
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9) | |
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27) | |
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28) | |
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29) | |
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7) | |
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30) | |
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31) | |
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32) | |
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33) | |
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34) | |
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35) | |
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36) | |
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36) | |
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37) | |
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38) | |
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38) | |
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39) | |
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40) | |
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6) | |
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41) | |
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42) | |
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43) | |
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5) | |
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5) | |
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78) | |
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79) | |
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80) | |
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81) | |
%52 = arith.xori %51, %true : i1 loc(#loc77) | |
%53 = arith.andi %50, %52 : i1 loc(#loc82) | |
%54 = arith.ori %48, %53 : i1 loc(#loc83) | |
%55 = arith.andi %50, %51 : i1 loc(#loc84) | |
%56 = arith.ori %49, %55 : i1 loc(#loc85) | |
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86) | |
%58 = arith.andi %56, %57 : i1 loc(#loc87) | |
%59 = arith.ori %54, %58 : i1 loc(#loc88) | |
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89) | |
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90) | |
tt.reduce.return %60, %61 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58) | |
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc62 = loc(callsite(#loc1 at #loc2)) | |
#loc63 = loc(callsite(#loc2 at #loc3)) | |
#loc64 = loc(callsite(#loc45 at #loc2)) | |
#loc65 = loc(callsite(#loc46 at #loc2)) | |
#loc66 = loc(callsite(#loc47 at #loc2)) | |
#loc67 = loc(callsite(#loc48 at #loc2)) | |
#loc68 = loc(callsite(#loc49 at #loc2)) | |
#loc69 = loc(callsite(#loc50 at #loc2)) | |
#loc70 = loc(callsite(#loc51 at #loc2)) | |
#loc71 = loc(callsite(#loc52 at #loc2)) | |
#loc72 = loc(callsite(#loc53 at #loc2)) | |
#loc73 = loc(callsite(#loc54 at #loc2)) | |
#loc74 = loc(callsite(#loc55 at #loc2)) | |
#loc75 = loc(callsite(#loc56 at #loc2)) | |
#loc76 = loc(callsite(#loc57 at #loc2)) | |
#loc77 = loc(callsite(#loc62 at #loc3)) | |
#loc78 = loc(callsite(#loc64 at #loc3)) | |
#loc79 = loc(callsite(#loc65 at #loc3)) | |
#loc80 = loc(callsite(#loc66 at #loc3)) | |
#loc81 = loc(callsite(#loc67 at #loc3)) | |
#loc82 = loc(callsite(#loc68 at #loc3)) | |
#loc83 = loc(callsite(#loc69 at #loc3)) | |
#loc84 = loc(callsite(#loc70 at #loc3)) | |
#loc85 = loc(callsite(#loc71 at #loc3)) | |
#loc86 = loc(callsite(#loc72 at #loc3)) | |
#loc87 = loc(callsite(#loc73 at #loc3)) | |
#loc88 = loc(callsite(#loc74 at #loc3)) | |
#loc89 = loc(callsite(#loc75 at #loc3)) | |
#loc90 = loc(callsite(#loc76 at #loc3)) | |
// -----// IR Dump Before Canonicalizer (canonicalize) ('builtin.module' operation) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc44 = loc(unknown) | |
module { | |
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%true = arith.constant true loc(#loc77) | |
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4) | |
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7) | |
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8) | |
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9) | |
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10) | |
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11) | |
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12) | |
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13) | |
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14) | |
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12) | |
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11) | |
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15) | |
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10) | |
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16) | |
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17) | |
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18) | |
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19) | |
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20) | |
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21) | |
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22) | |
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23) | |
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24) | |
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25) | |
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25) | |
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26) | |
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9) | |
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9) | |
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27) | |
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28) | |
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29) | |
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7) | |
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30) | |
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31) | |
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32) | |
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33) | |
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34) | |
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35) | |
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36) | |
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36) | |
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37) | |
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38) | |
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38) | |
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39) | |
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40) | |
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6) | |
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41) | |
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42) | |
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43) | |
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5) | |
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5) | |
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78) | |
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79) | |
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80) | |
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81) | |
%52 = arith.xori %51, %true : i1 loc(#loc77) | |
%53 = arith.andi %50, %52 : i1 loc(#loc82) | |
%54 = arith.ori %48, %53 : i1 loc(#loc83) | |
%55 = arith.andi %50, %51 : i1 loc(#loc84) | |
%56 = arith.ori %49, %55 : i1 loc(#loc85) | |
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86) | |
%58 = arith.andi %56, %57 : i1 loc(#loc87) | |
%59 = arith.ori %54, %58 : i1 loc(#loc88) | |
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89) | |
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90) | |
tt.reduce.return %60, %61 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58) | |
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc62 = loc(callsite(#loc1 at #loc2)) | |
#loc63 = loc(callsite(#loc2 at #loc3)) | |
#loc64 = loc(callsite(#loc45 at #loc2)) | |
#loc65 = loc(callsite(#loc46 at #loc2)) | |
#loc66 = loc(callsite(#loc47 at #loc2)) | |
#loc67 = loc(callsite(#loc48 at #loc2)) | |
#loc68 = loc(callsite(#loc49 at #loc2)) | |
#loc69 = loc(callsite(#loc50 at #loc2)) | |
#loc70 = loc(callsite(#loc51 at #loc2)) | |
#loc71 = loc(callsite(#loc52 at #loc2)) | |
#loc72 = loc(callsite(#loc53 at #loc2)) | |
#loc73 = loc(callsite(#loc54 at #loc2)) | |
#loc74 = loc(callsite(#loc55 at #loc2)) | |
#loc75 = loc(callsite(#loc56 at #loc2)) | |
#loc76 = loc(callsite(#loc57 at #loc2)) | |
#loc77 = loc(callsite(#loc62 at #loc3)) | |
#loc78 = loc(callsite(#loc64 at #loc3)) | |
#loc79 = loc(callsite(#loc65 at #loc3)) | |
#loc80 = loc(callsite(#loc66 at #loc3)) | |
#loc81 = loc(callsite(#loc67 at #loc3)) | |
#loc82 = loc(callsite(#loc68 at #loc3)) | |
#loc83 = loc(callsite(#loc69 at #loc3)) | |
#loc84 = loc(callsite(#loc70 at #loc3)) | |
#loc85 = loc(callsite(#loc71 at #loc3)) | |
#loc86 = loc(callsite(#loc72 at #loc3)) | |
#loc87 = loc(callsite(#loc73 at #loc3)) | |
#loc88 = loc(callsite(#loc74 at #loc3)) | |
#loc89 = loc(callsite(#loc75 at #loc3)) | |
#loc90 = loc(callsite(#loc76 at #loc3)) | |
// -----// IR Dump Before TritonReorderBroadcast (triton-reorder-broadcast) ('builtin.module' operation) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc44 = loc(unknown) | |
module { | |
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%true = arith.constant true loc(#loc77) | |
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4) | |
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7) | |
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8) | |
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9) | |
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10) | |
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11) | |
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12) | |
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13) | |
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14) | |
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12) | |
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11) | |
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15) | |
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10) | |
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16) | |
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17) | |
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18) | |
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19) | |
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20) | |
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21) | |
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22) | |
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23) | |
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24) | |
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25) | |
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25) | |
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26) | |
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9) | |
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9) | |
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27) | |
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28) | |
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29) | |
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7) | |
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30) | |
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31) | |
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32) | |
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33) | |
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34) | |
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35) | |
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36) | |
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36) | |
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37) | |
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38) | |
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38) | |
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39) | |
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40) | |
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6) | |
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41) | |
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42) | |
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43) | |
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5) | |
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5) | |
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78) | |
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79) | |
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80) | |
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81) | |
%52 = arith.xori %51, %true : i1 loc(#loc77) | |
%53 = arith.andi %50, %52 : i1 loc(#loc82) | |
%54 = arith.ori %48, %53 : i1 loc(#loc83) | |
%55 = arith.andi %50, %51 : i1 loc(#loc84) | |
%56 = arith.ori %49, %55 : i1 loc(#loc85) | |
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86) | |
%58 = arith.andi %56, %57 : i1 loc(#loc87) | |
%59 = arith.ori %54, %58 : i1 loc(#loc88) | |
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89) | |
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90) | |
tt.reduce.return %60, %61 : f64, i32 loc(#loc63) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63) | |
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58) | |
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59) | |
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59) | |
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60) | |
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60) | |
tt.return loc(#loc61) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49) | |
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc62 = loc(callsite(#loc1 at #loc2)) | |
#loc63 = loc(callsite(#loc2 at #loc3)) | |
#loc64 = loc(callsite(#loc45 at #loc2)) | |
#loc65 = loc(callsite(#loc46 at #loc2)) | |
#loc66 = loc(callsite(#loc47 at #loc2)) | |
#loc67 = loc(callsite(#loc48 at #loc2)) | |
#loc68 = loc(callsite(#loc49 at #loc2)) | |
#loc69 = loc(callsite(#loc50 at #loc2)) | |
#loc70 = loc(callsite(#loc51 at #loc2)) | |
#loc71 = loc(callsite(#loc52 at #loc2)) | |
#loc72 = loc(callsite(#loc53 at #loc2)) | |
#loc73 = loc(callsite(#loc54 at #loc2)) | |
#loc74 = loc(callsite(#loc55 at #loc2)) | |
#loc75 = loc(callsite(#loc56 at #loc2)) | |
#loc76 = loc(callsite(#loc57 at #loc2)) | |
#loc77 = loc(callsite(#loc62 at #loc3)) | |
#loc78 = loc(callsite(#loc64 at #loc3)) | |
#loc79 = loc(callsite(#loc65 at #loc3)) | |
#loc80 = loc(callsite(#loc66 at #loc3)) | |
#loc81 = loc(callsite(#loc67 at #loc3)) | |
#loc82 = loc(callsite(#loc68 at #loc3)) | |
#loc83 = loc(callsite(#loc69 at #loc3)) | |
#loc84 = loc(callsite(#loc70 at #loc3)) | |
#loc85 = loc(callsite(#loc71 at #loc3)) | |
#loc86 = loc(callsite(#loc72 at #loc3)) | |
#loc87 = loc(callsite(#loc73 at #loc3)) | |
#loc88 = loc(callsite(#loc74 at #loc3)) | |
#loc89 = loc(callsite(#loc75 at #loc3)) | |
#loc90 = loc(callsite(#loc76 at #loc3)) | |
// -----// IR Dump Before CSE (cse) ('builtin.module' operation) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc44 = loc(unknown) | |
module { | |
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc1) | |
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64> loc(#loc2) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc3) | |
%true = arith.constant true loc(#loc76) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc7) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc8) | |
%cst_3 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc9) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc10) | |
%cst_5 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc12) | |
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc13) | |
%2 = arith.cmpi slt, %1, %cst_5 : tensor<1x32xi32> loc(#loc11) | |
%3 = arith.remsi %1, %cst_4 : tensor<1x32xi32> loc(#loc10) | |
%4 = arith.divsi %1, %cst_4 : tensor<1x32xi32> loc(#loc14) | |
%5 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc9) | |
%6 = arith.divsi %5, %cst_4 : tensor<1x32xi32> loc(#loc15) | |
%7 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc16) | |
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32> loc(#loc17) | |
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32> loc(#loc18) | |
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc19) | |
%11 = arith.muli %4, %cst_3 : tensor<1x32xi32> loc(#loc20) | |
%12 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc21) | |
%13 = arith.divsi %12, %cst_4 : tensor<1x32xi32> loc(#loc22) | |
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc23) | |
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc24) | |
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc24) | |
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc25) | |
%18 = tt.load %16, %17, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc26) | |
%19 = arith.select %10, %18, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27) | |
%20 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc28) | |
%21 = arith.divsi %20, %cst_4 : tensor<1x32xi32> loc(#loc29) | |
%22 = arith.addi %21, %cst_2 : tensor<1x32xi32> loc(#loc8) | |
%23 = arith.cmpi slt, %22, %9 : tensor<1x32xi32> loc(#loc30) | |
%24 = arith.muli %4, %cst_3 : tensor<1x32xi32> loc(#loc31) | |
%25 = arith.addi %24, %cst_2 : tensor<1x32xi32> loc(#loc32) | |
%26 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc33) | |
%27 = arith.divsi %26, %cst_4 : tensor<1x32xi32> loc(#loc34) | |
%28 = arith.addi %25, %27 : tensor<1x32xi32> loc(#loc35) | |
%29 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36) | |
%30 = tt.addptr %29, %28 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36) | |
%31 = arith.andi %2, %23 : tensor<1x32xi1> loc(#loc37) | |
%32 = tt.load %30, %31, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38) | |
%33 = arith.select %23, %32, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39) | |
%34 = arith.addf %33, %19 : tensor<1x32xf64> loc(#loc40) | |
%35 = arith.select %10, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc7) | |
%36 = arith.select %23, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41) | |
%37 = arith.addf %36, %35 : tensor<1x32xf64> loc(#loc42) | |
%38 = arith.divf %34, %37 : tensor<1x32xf64> loc(#loc43) | |
%39 = arith.select %2, %38, %cst_0 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc2) | |
%40:2 = "tt.reduce"(%39, %1) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
%45 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc77) | |
%46 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc78) | |
%47 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc79) | |
%48 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc80) | |
%49 = arith.xori %48, %true : i1 loc(#loc76) | |
%50 = arith.andi %47, %49 : i1 loc(#loc81) | |
%51 = arith.ori %45, %50 : i1 loc(#loc82) | |
%52 = arith.andi %47, %48 : i1 loc(#loc83) | |
%53 = arith.ori %46, %52 : i1 loc(#loc84) | |
%54 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc85) | |
%55 = arith.andi %53, %54 : i1 loc(#loc86) | |
%56 = arith.ori %51, %55 : i1 loc(#loc87) | |
%57 = arith.select %56, %arg4, %arg6 : f64 loc(#loc88) | |
%58 = arith.select %56, %arg5, %arg7 : i32 loc(#loc89) | |
tt.reduce.return %57, %58 : f64, i32 loc(#loc62) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc62) | |
%41 = tt.expand_dims %40#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58) | |
%42 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3) | |
%43 = tt.splat %42 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc3) | |
%44 = arith.extsi %41 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc59) | |
tt.store %43, %44 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc59) | |
tt.return loc(#loc60) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71) | |
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc61 = loc(callsite(#loc4 at #loc5)) | |
#loc62 = loc(callsite(#loc5 at #loc6)) | |
#loc63 = loc(callsite(#loc45 at #loc5)) | |
#loc64 = loc(callsite(#loc46 at #loc5)) | |
#loc65 = loc(callsite(#loc47 at #loc5)) | |
#loc66 = loc(callsite(#loc48 at #loc5)) | |
#loc67 = loc(callsite(#loc49 at #loc5)) | |
#loc68 = loc(callsite(#loc50 at #loc5)) | |
#loc69 = loc(callsite(#loc51 at #loc5)) | |
#loc70 = loc(callsite(#loc52 at #loc5)) | |
#loc71 = loc(callsite(#loc53 at #loc5)) | |
#loc72 = loc(callsite(#loc54 at #loc5)) | |
#loc73 = loc(callsite(#loc55 at #loc5)) | |
#loc74 = loc(callsite(#loc56 at #loc5)) | |
#loc75 = loc(callsite(#loc57 at #loc5)) | |
#loc76 = loc(callsite(#loc61 at #loc6)) | |
#loc77 = loc(callsite(#loc63 at #loc6)) | |
#loc78 = loc(callsite(#loc64 at #loc6)) | |
#loc79 = loc(callsite(#loc65 at #loc6)) | |
#loc80 = loc(callsite(#loc66 at #loc6)) | |
#loc81 = loc(callsite(#loc67 at #loc6)) | |
#loc82 = loc(callsite(#loc68 at #loc6)) | |
#loc83 = loc(callsite(#loc69 at #loc6)) | |
#loc84 = loc(callsite(#loc70 at #loc6)) | |
#loc85 = loc(callsite(#loc71 at #loc6)) | |
#loc86 = loc(callsite(#loc72 at #loc6)) | |
#loc87 = loc(callsite(#loc73 at #loc6)) | |
#loc88 = loc(callsite(#loc74 at #loc6)) | |
#loc89 = loc(callsite(#loc75 at #loc6)) | |
// -----// IR Dump Before LoopInvariantCodeMotion (loop-invariant-code-motion) ('builtin.module' operation) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc35 = loc(unknown) | |
// Triton (tt) dialect IR for kernel @triton__0d1d23de, as printed before the
// LoopInvariantCodeMotion pass (see the dump header preceding this module).
// Visible behavior: across 32 lanes, sum two masked f64 loads from %arg0 and
// divide by the number of contributing taps, run an argmax-style
// (value, lane index) reduction along axis 1, and store the winning lane
// index, sign-extended to i64, at %arg1.
// The trailing loc(...) on each op is a source-location annotation; the
// #locN aliases are defined next to the module in this dump.
module { | |
// Arguments: %arg0 is the f64 pointer that is loaded from, %arg1 the i64
// pointer that is stored to. %arg2 and %arg3 (i32) are not referenced by any
// op in this body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
// Constants: %cst = 0.0 (fill value for masked loads/selects); %cst_0 has the
// f64 bit pattern 0xFFF0000000000000, which is IEEE-754 negative infinity;
// %c0_i32 = scalar 0 offset; %true is xor-ed with an i1 below to negate it;
// the remaining splats are 1.0 and the integers 1, 3, 2, 32.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc1) | |
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64> loc(#loc2) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc3) | |
%true = arith.constant true loc(#loc67) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc7) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc8) | |
%cst_3 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc9) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc10) | |
%cst_5 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
// Lane index 0..31 reshaped to 1x32; %2 is the "index < 32" mask.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc12) | |
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc13) | |
%2 = arith.cmpi slt, %1, %cst_5 : tensor<1x32xi32> loc(#loc11) | |
// Split the lane index: %3 = index % 2, %4 = index / 2.
%3 = arith.remsi %1, %cst_4 : tensor<1x32xi32> loc(#loc10) | |
%4 = arith.divsi %1, %cst_4 : tensor<1x32xi32> loc(#loc14) | |
// %6 = (%3 * 3) / 2, %7 = %6 + 2, and %8 tests %6 < %7.
// NOTE(review): presumably start/end bounds of a pooling-style window — confirm against the generating Python.
%5 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc9) | |
%6 = arith.divsi %5, %cst_4 : tensor<1x32xi32> loc(#loc15) | |
%7 = arith.addi %6, %cst_4 : tensor<1x32xi32> loc(#loc16) | |
%8 = arith.cmpi slt, %6, %7 : tensor<1x32xi32> loc(#loc17) | |
// First tap: element offset %4 * 3 + %6 from %arg0, loaded under mask
// %2 & %8 with 0.0 for masked-off lanes, then re-selected on %8 alone.
%9 = arith.muli %4, %cst_3 : tensor<1x32xi32> loc(#loc18) | |
%10 = arith.addi %9, %6 : tensor<1x32xi32> loc(#loc19) | |
%11 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc20) | |
%12 = tt.addptr %11, %10 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc20) | |
%13 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc21) | |
%14 = tt.load %12, %13, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc22) | |
%15 = arith.select %8, %14, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc23) | |
// Second tap: predicate %17 = (%6 + 1) < %7, element offset %4 * 3 + 1 + %6,
// loaded and selected the same way as the first tap.
%16 = arith.addi %6, %cst_2 : tensor<1x32xi32> loc(#loc8) | |
%17 = arith.cmpi slt, %16, %7 : tensor<1x32xi32> loc(#loc24) | |
%18 = arith.addi %9, %cst_2 : tensor<1x32xi32> loc(#loc25) | |
%19 = arith.addi %18, %6 : tensor<1x32xi32> loc(#loc26) | |
%20 = tt.addptr %11, %19 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc27) | |
%21 = arith.andi %2, %17 : tensor<1x32xi1> loc(#loc28) | |
%22 = tt.load %20, %21, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc29) | |
%23 = arith.select %17, %22, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc30) | |
// %24 = sum of the two taps; %27 = number of taps whose predicate held
// (1.0 per tap, else 0.0); %28 = sum / count.
%24 = arith.addf %23, %15 : tensor<1x32xf64> loc(#loc31) | |
%25 = arith.select %8, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc7) | |
%26 = arith.select %17, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
%27 = arith.addf %26, %25 : tensor<1x32xf64> loc(#loc33) | |
%28 = arith.divf %24, %27 : tensor<1x32xf64> loc(#loc34) | |
// Lanes outside the %2 mask are replaced with the -inf constant before the
// reduction (presumably so they do not win the maximum).
%29 = arith.select %2, %28, %cst_0 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc2) | |
// Reduce (value, lane index) pairs along axis 1. The combiner keeps
// (%arg4, %arg5) over (%arg6, %arg7) exactly when %46 is true.
%30:2 = "tt.reduce"(%29, %1) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
// %35: %arg4 > %arg6 (ordered); %36: %arg4 == %arg6 (ordered).
%35 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68) | |
%36 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69) | |
// "x != x" (unordered-or-not-equal) is true only when x is NaN:
// %37 flags %arg4 as NaN, %38 flags %arg6 as NaN.
%37 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70) | |
%38 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71) | |
// %39 = not %38; %41 = %arg4 is greater, or %arg4 is NaN while %arg6 is not.
%39 = arith.xori %38, %true : i1 loc(#loc67) | |
%40 = arith.andi %37, %39 : i1 loc(#loc72) | |
%41 = arith.ori %35, %40 : i1 loc(#loc73) | |
// %43 = tie: the values compare equal, or both are NaN.
%42 = arith.andi %37, %38 : i1 loc(#loc74) | |
%43 = arith.ori %36, %42 : i1 loc(#loc75) | |
// On a tie the pair with the smaller index wins (%44 = %arg5 < %arg7).
%44 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76) | |
%45 = arith.andi %43, %44 : i1 loc(#loc77) | |
%46 = arith.ori %41, %45 : i1 loc(#loc78) | |
// The same predicate picks both the value and its index.
%47 = arith.select %46, %arg4, %arg6 : f64 loc(#loc79) | |
%48 = arith.select %46, %arg5, %arg7 : i32 loc(#loc80) | |
tt.reduce.return %47, %48 : f64, i32 loc(#loc53) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc53) | |
// Only the index result (%30#1) is used: reshape it to 1x1, sign-extend
// i32 -> i64, and store it at %arg1 + 0.
%31 = tt.expand_dims %30#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc49) | |
%32 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3) | |
%33 = tt.splat %32 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc3) | |
%34 = arith.extsi %31 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc50) | |
tt.store %33, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc50) | |
tt.return loc(#loc51) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc52 = loc(callsite(#loc4 at #loc5)) | |
#loc53 = loc(callsite(#loc5 at #loc6)) | |
#loc54 = loc(callsite(#loc36 at #loc5)) | |
#loc55 = loc(callsite(#loc37 at #loc5)) | |
#loc56 = loc(callsite(#loc38 at #loc5)) | |
#loc57 = loc(callsite(#loc39 at #loc5)) | |
#loc58 = loc(callsite(#loc40 at #loc5)) | |
#loc59 = loc(callsite(#loc41 at #loc5)) | |
#loc60 = loc(callsite(#loc42 at #loc5)) | |
#loc61 = loc(callsite(#loc43 at #loc5)) | |
#loc62 = loc(callsite(#loc44 at #loc5)) | |
#loc63 = loc(callsite(#loc45 at #loc5)) | |
#loc64 = loc(callsite(#loc46 at #loc5)) | |
#loc65 = loc(callsite(#loc47 at #loc5)) | |
#loc66 = loc(callsite(#loc48 at #loc5)) | |
#loc67 = loc(callsite(#loc52 at #loc6)) | |
#loc68 = loc(callsite(#loc54 at #loc6)) | |
#loc69 = loc(callsite(#loc55 at #loc6)) | |
#loc70 = loc(callsite(#loc56 at #loc6)) | |
#loc71 = loc(callsite(#loc57 at #loc6)) | |
#loc72 = loc(callsite(#loc58 at #loc6)) | |
#loc73 = loc(callsite(#loc59 at #loc6)) | |
#loc74 = loc(callsite(#loc60 at #loc6)) | |
#loc75 = loc(callsite(#loc61 at #loc6)) | |
#loc76 = loc(callsite(#loc62 at #loc6)) | |
#loc77 = loc(callsite(#loc63 at #loc6)) | |
#loc78 = loc(callsite(#loc64 at #loc6)) | |
#loc79 = loc(callsite(#loc65 at #loc6)) | |
#loc80 = loc(callsite(#loc66 at #loc6)) | |
// -----// IR Dump Before SymbolDCE (symbol-dce) ('builtin.module' operation) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc35 = loc(unknown) | |
// Triton (tt) dialect IR for kernel @triton__0d1d23de, as printed before the
// SymbolDCE pass (see the dump header preceding this module).
// Visible behavior: across 32 lanes, sum two masked f64 loads from %arg0 and
// divide by the number of contributing taps, run an argmax-style
// (value, lane index) reduction along axis 1, and store the winning lane
// index, sign-extended to i64, at %arg1.
// The trailing loc(...) on each op is a source-location annotation; the
// #locN aliases are defined next to the module in this dump.
module { | |
// Arguments: %arg0 is the f64 pointer that is loaded from, %arg1 the i64
// pointer that is stored to. %arg2 and %arg3 (i32) are not referenced by any
// op in this body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
// Constants: %cst = 0.0 (fill value for masked loads/selects); %cst_0 has the
// f64 bit pattern 0xFFF0000000000000, which is IEEE-754 negative infinity;
// %c0_i32 = scalar 0 offset; %true is xor-ed with an i1 below to negate it;
// the remaining splats are 1.0 and the integers 1, 3, 2, 32.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc1) | |
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64> loc(#loc2) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc3) | |
%true = arith.constant true loc(#loc67) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc7) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc8) | |
%cst_3 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc9) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc10) | |
%cst_5 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
// Lane index 0..31 reshaped to 1x32; %2 is the "index < 32" mask.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc12) | |
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc13) | |
%2 = arith.cmpi slt, %1, %cst_5 : tensor<1x32xi32> loc(#loc11) | |
// Split the lane index: %3 = index % 2, %4 = index / 2.
%3 = arith.remsi %1, %cst_4 : tensor<1x32xi32> loc(#loc10) | |
%4 = arith.divsi %1, %cst_4 : tensor<1x32xi32> loc(#loc14) | |
// %6 = (%3 * 3) / 2, %7 = %6 + 2, and %8 tests %6 < %7.
// NOTE(review): presumably start/end bounds of a pooling-style window — confirm against the generating Python.
%5 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc9) | |
%6 = arith.divsi %5, %cst_4 : tensor<1x32xi32> loc(#loc15) | |
%7 = arith.addi %6, %cst_4 : tensor<1x32xi32> loc(#loc16) | |
%8 = arith.cmpi slt, %6, %7 : tensor<1x32xi32> loc(#loc17) | |
// First tap: element offset %4 * 3 + %6 from %arg0, loaded under mask
// %2 & %8 with 0.0 for masked-off lanes, then re-selected on %8 alone.
%9 = arith.muli %4, %cst_3 : tensor<1x32xi32> loc(#loc18) | |
%10 = arith.addi %9, %6 : tensor<1x32xi32> loc(#loc19) | |
%11 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc20) | |
%12 = tt.addptr %11, %10 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc20) | |
%13 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc21) | |
%14 = tt.load %12, %13, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc22) | |
%15 = arith.select %8, %14, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc23) | |
// Second tap: predicate %17 = (%6 + 1) < %7, element offset %4 * 3 + 1 + %6,
// loaded and selected the same way as the first tap.
%16 = arith.addi %6, %cst_2 : tensor<1x32xi32> loc(#loc8) | |
%17 = arith.cmpi slt, %16, %7 : tensor<1x32xi32> loc(#loc24) | |
%18 = arith.addi %9, %cst_2 : tensor<1x32xi32> loc(#loc25) | |
%19 = arith.addi %18, %6 : tensor<1x32xi32> loc(#loc26) | |
%20 = tt.addptr %11, %19 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc27) | |
%21 = arith.andi %2, %17 : tensor<1x32xi1> loc(#loc28) | |
%22 = tt.load %20, %21, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc29) | |
%23 = arith.select %17, %22, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc30) | |
// %24 = sum of the two taps; %27 = number of taps whose predicate held
// (1.0 per tap, else 0.0); %28 = sum / count.
%24 = arith.addf %23, %15 : tensor<1x32xf64> loc(#loc31) | |
%25 = arith.select %8, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc7) | |
%26 = arith.select %17, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
%27 = arith.addf %26, %25 : tensor<1x32xf64> loc(#loc33) | |
%28 = arith.divf %24, %27 : tensor<1x32xf64> loc(#loc34) | |
// Lanes outside the %2 mask are replaced with the -inf constant before the
// reduction (presumably so they do not win the maximum).
%29 = arith.select %2, %28, %cst_0 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc2) | |
// Reduce (value, lane index) pairs along axis 1. The combiner keeps
// (%arg4, %arg5) over (%arg6, %arg7) exactly when %46 is true.
%30:2 = "tt.reduce"(%29, %1) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
// %35: %arg4 > %arg6 (ordered); %36: %arg4 == %arg6 (ordered).
%35 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68) | |
%36 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69) | |
// "x != x" (unordered-or-not-equal) is true only when x is NaN:
// %37 flags %arg4 as NaN, %38 flags %arg6 as NaN.
%37 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70) | |
%38 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71) | |
// %39 = not %38; %41 = %arg4 is greater, or %arg4 is NaN while %arg6 is not.
%39 = arith.xori %38, %true : i1 loc(#loc67) | |
%40 = arith.andi %37, %39 : i1 loc(#loc72) | |
%41 = arith.ori %35, %40 : i1 loc(#loc73) | |
// %43 = tie: the values compare equal, or both are NaN.
%42 = arith.andi %37, %38 : i1 loc(#loc74) | |
%43 = arith.ori %36, %42 : i1 loc(#loc75) | |
// On a tie the pair with the smaller index wins (%44 = %arg5 < %arg7).
%44 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76) | |
%45 = arith.andi %43, %44 : i1 loc(#loc77) | |
%46 = arith.ori %41, %45 : i1 loc(#loc78) | |
// The same predicate picks both the value and its index.
%47 = arith.select %46, %arg4, %arg6 : f64 loc(#loc79) | |
%48 = arith.select %46, %arg5, %arg7 : i32 loc(#loc80) | |
tt.reduce.return %47, %48 : f64, i32 loc(#loc53) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc53) | |
// Only the index result (%30#1) is used: reshape it to 1x1, sign-extend
// i32 -> i64, and store it at %arg1 + 0.
%31 = tt.expand_dims %30#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc49) | |
%32 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3) | |
%33 = tt.splat %32 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc3) | |
%34 = arith.extsi %31 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc50) | |
tt.store %33, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc50) | |
tt.return loc(#loc51) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc52 = loc(callsite(#loc4 at #loc5)) | |
#loc53 = loc(callsite(#loc5 at #loc6)) | |
#loc54 = loc(callsite(#loc36 at #loc5)) | |
#loc55 = loc(callsite(#loc37 at #loc5)) | |
#loc56 = loc(callsite(#loc38 at #loc5)) | |
#loc57 = loc(callsite(#loc39 at #loc5)) | |
#loc58 = loc(callsite(#loc40 at #loc5)) | |
#loc59 = loc(callsite(#loc41 at #loc5)) | |
#loc60 = loc(callsite(#loc42 at #loc5)) | |
#loc61 = loc(callsite(#loc43 at #loc5)) | |
#loc62 = loc(callsite(#loc44 at #loc5)) | |
#loc63 = loc(callsite(#loc45 at #loc5)) | |
#loc64 = loc(callsite(#loc46 at #loc5)) | |
#loc65 = loc(callsite(#loc47 at #loc5)) | |
#loc66 = loc(callsite(#loc48 at #loc5)) | |
#loc67 = loc(callsite(#loc52 at #loc6)) | |
#loc68 = loc(callsite(#loc54 at #loc6)) | |
#loc69 = loc(callsite(#loc55 at #loc6)) | |
#loc70 = loc(callsite(#loc56 at #loc6)) | |
#loc71 = loc(callsite(#loc57 at #loc6)) | |
#loc72 = loc(callsite(#loc58 at #loc6)) | |
#loc73 = loc(callsite(#loc59 at #loc6)) | |
#loc74 = loc(callsite(#loc60 at #loc6)) | |
#loc75 = loc(callsite(#loc61 at #loc6)) | |
#loc76 = loc(callsite(#loc62 at #loc6)) | |
#loc77 = loc(callsite(#loc63 at #loc6)) | |
#loc78 = loc(callsite(#loc64 at #loc6)) | |
#loc79 = loc(callsite(#loc65 at #loc6)) | |
#loc80 = loc(callsite(#loc66 at #loc6)) | |
// -----// IR Dump Before ConvertTritonToTritonGPU (convert-triton-to-tritongpu) ('builtin.module' operation) //----- // | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc35 = loc(unknown) | |
// Triton (tt) dialect IR for kernel @triton__0d1d23de, as printed before the
// ConvertTritonToTritonGPU pass (see the dump header preceding this module).
// Visible behavior: across 32 lanes, sum two masked f64 loads from %arg0 and
// divide by the number of contributing taps, run an argmax-style
// (value, lane index) reduction along axis 1, and store the winning lane
// index, sign-extended to i64, at %arg1.
// The trailing loc(...) on each op is a source-location annotation; the
// #locN aliases are defined next to the module in this dump.
module { | |
// Arguments: %arg0 is the f64 pointer that is loaded from, %arg1 the i64
// pointer that is stored to. %arg2 and %arg3 (i32) are not referenced by any
// op in this body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
// Constants: %cst = 0.0 (fill value for masked loads/selects); %cst_0 has the
// f64 bit pattern 0xFFF0000000000000, which is IEEE-754 negative infinity;
// %c0_i32 = scalar 0 offset; %true is xor-ed with an i1 below to negate it;
// the remaining splats are 1.0 and the integers 1, 3, 2, 32.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc1) | |
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64> loc(#loc2) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc3) | |
%true = arith.constant true loc(#loc67) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc7) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc8) | |
%cst_3 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc9) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc10) | |
%cst_5 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11) | |
// Lane index 0..31 reshaped to 1x32; %2 is the "index < 32" mask.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc12) | |
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc13) | |
%2 = arith.cmpi slt, %1, %cst_5 : tensor<1x32xi32> loc(#loc11) | |
// Split the lane index: %3 = index % 2, %4 = index / 2.
%3 = arith.remsi %1, %cst_4 : tensor<1x32xi32> loc(#loc10) | |
%4 = arith.divsi %1, %cst_4 : tensor<1x32xi32> loc(#loc14) | |
// %6 = (%3 * 3) / 2, %7 = %6 + 2, and %8 tests %6 < %7.
// NOTE(review): presumably start/end bounds of a pooling-style window — confirm against the generating Python.
%5 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc9) | |
%6 = arith.divsi %5, %cst_4 : tensor<1x32xi32> loc(#loc15) | |
%7 = arith.addi %6, %cst_4 : tensor<1x32xi32> loc(#loc16) | |
%8 = arith.cmpi slt, %6, %7 : tensor<1x32xi32> loc(#loc17) | |
// First tap: element offset %4 * 3 + %6 from %arg0, loaded under mask
// %2 & %8 with 0.0 for masked-off lanes, then re-selected on %8 alone.
%9 = arith.muli %4, %cst_3 : tensor<1x32xi32> loc(#loc18) | |
%10 = arith.addi %9, %6 : tensor<1x32xi32> loc(#loc19) | |
%11 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc20) | |
%12 = tt.addptr %11, %10 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc20) | |
%13 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc21) | |
%14 = tt.load %12, %13, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc22) | |
%15 = arith.select %8, %14, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc23) | |
// Second tap: predicate %17 = (%6 + 1) < %7, element offset %4 * 3 + 1 + %6,
// loaded and selected the same way as the first tap.
%16 = arith.addi %6, %cst_2 : tensor<1x32xi32> loc(#loc8) | |
%17 = arith.cmpi slt, %16, %7 : tensor<1x32xi32> loc(#loc24) | |
%18 = arith.addi %9, %cst_2 : tensor<1x32xi32> loc(#loc25) | |
%19 = arith.addi %18, %6 : tensor<1x32xi32> loc(#loc26) | |
%20 = tt.addptr %11, %19 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc27) | |
%21 = arith.andi %2, %17 : tensor<1x32xi1> loc(#loc28) | |
%22 = tt.load %20, %21, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc29) | |
%23 = arith.select %17, %22, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc30) | |
// %24 = sum of the two taps; %27 = number of taps whose predicate held
// (1.0 per tap, else 0.0); %28 = sum / count.
%24 = arith.addf %23, %15 : tensor<1x32xf64> loc(#loc31) | |
%25 = arith.select %8, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc7) | |
%26 = arith.select %17, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32) | |
%27 = arith.addf %26, %25 : tensor<1x32xf64> loc(#loc33) | |
%28 = arith.divf %24, %27 : tensor<1x32xf64> loc(#loc34) | |
// Lanes outside the %2 mask are replaced with the -inf constant before the
// reduction (presumably so they do not win the maximum).
%29 = arith.select %2, %28, %cst_0 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc2) | |
// Reduce (value, lane index) pairs along axis 1. The combiner keeps
// (%arg4, %arg5) over (%arg6, %arg7) exactly when %46 is true.
%30:2 = "tt.reduce"(%29, %1) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
// %35: %arg4 > %arg6 (ordered); %36: %arg4 == %arg6 (ordered).
%35 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68) | |
%36 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69) | |
// "x != x" (unordered-or-not-equal) is true only when x is NaN:
// %37 flags %arg4 as NaN, %38 flags %arg6 as NaN.
%37 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70) | |
%38 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71) | |
// %39 = not %38; %41 = %arg4 is greater, or %arg4 is NaN while %arg6 is not.
%39 = arith.xori %38, %true : i1 loc(#loc67) | |
%40 = arith.andi %37, %39 : i1 loc(#loc72) | |
%41 = arith.ori %35, %40 : i1 loc(#loc73) | |
// %43 = tie: the values compare equal, or both are NaN.
%42 = arith.andi %37, %38 : i1 loc(#loc74) | |
%43 = arith.ori %36, %42 : i1 loc(#loc75) | |
// On a tie the pair with the smaller index wins (%44 = %arg5 < %arg7).
%44 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76) | |
%45 = arith.andi %43, %44 : i1 loc(#loc77) | |
%46 = arith.ori %41, %45 : i1 loc(#loc78) | |
// The same predicate picks both the value and its index.
%47 = arith.select %46, %arg4, %arg6 : f64 loc(#loc79) | |
%48 = arith.select %46, %arg5, %arg7 : i32 loc(#loc80) | |
tt.reduce.return %47, %48 : f64, i32 loc(#loc53) | |
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc53) | |
// Only the index result (%30#1) is used: reshape it to 1x1, sign-extend
// i32 -> i64, and store it at %arg1 + 0.
%31 = tt.expand_dims %30#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc49) | |
%32 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3) | |
%33 = tt.splat %32 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc3) | |
%34 = arith.extsi %31 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc50) | |
tt.store %33, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc50) | |
tt.return loc(#loc51) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc52 = loc(callsite(#loc4 at #loc5)) | |
#loc53 = loc(callsite(#loc5 at #loc6)) | |
#loc54 = loc(callsite(#loc36 at #loc5)) | |
#loc55 = loc(callsite(#loc37 at #loc5)) | |
#loc56 = loc(callsite(#loc38 at #loc5)) | |
#loc57 = loc(callsite(#loc39 at #loc5)) | |
#loc58 = loc(callsite(#loc40 at #loc5)) | |
#loc59 = loc(callsite(#loc41 at #loc5)) | |
#loc60 = loc(callsite(#loc42 at #loc5)) | |
#loc61 = loc(callsite(#loc43 at #loc5)) | |
#loc62 = loc(callsite(#loc44 at #loc5)) | |
#loc63 = loc(callsite(#loc45 at #loc5)) | |
#loc64 = loc(callsite(#loc46 at #loc5)) | |
#loc65 = loc(callsite(#loc47 at #loc5)) | |
#loc66 = loc(callsite(#loc48 at #loc5)) | |
#loc67 = loc(callsite(#loc52 at #loc6)) | |
#loc68 = loc(callsite(#loc54 at #loc6)) | |
#loc69 = loc(callsite(#loc55 at #loc6)) | |
#loc70 = loc(callsite(#loc56 at #loc6)) | |
#loc71 = loc(callsite(#loc57 at #loc6)) | |
#loc72 = loc(callsite(#loc58 at #loc6)) | |
#loc73 = loc(callsite(#loc59 at #loc6)) | |
#loc74 = loc(callsite(#loc60 at #loc6)) | |
#loc75 = loc(callsite(#loc61 at #loc6)) | |
#loc76 = loc(callsite(#loc62 at #loc6)) | |
#loc77 = loc(callsite(#loc63 at #loc6)) | |
#loc78 = loc(callsite(#loc64 at #loc6)) | |
#loc79 = loc(callsite(#loc65 at #loc6)) | |
#loc80 = loc(callsite(#loc66 at #loc6)) | |
// -----// IR Dump Before TritonGPUCoalesce (tritongpu-coalesce) ('builtin.module' operation) //----- // | |
// Layout attribute aliases referenced by the module in this dump.
// #blocked: 2-D layout, threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0];
// it is the layout carried by the 1x32 tensors in the function body.
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
// #blocked1: 1-D layout (32 threads per warp, 2 warps); used for the tt.make_range result
// and as an intermediate layout for the reduced index.
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> | |
// #blocked2: parent layout of the slice used when expanding the range to 1x32.
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}> | |
// #blocked3 / #blocked4: layouts of the 1x1 index tensor after the reduction; they differ
// only in `order` ([0, 1] vs [1, 0]).
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc35 = loc(unknown) | |
// Module holding a single Triton kernel, as dumped before the tritongpu-coalesce pass.
// Module attributes: compute capability 80, 1 CTA, 2 warps, 32 threads per warp.
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { | |
// Kernel entry point.
//   %arg0: f64 pointer, read by the two tt.load ops below.
//   %arg1: i64 pointer, written by the final tt.store.
//   %arg2, %arg3: i32 scalars that are not referenced anywhere in this body.
// Summary of the body: for 32 lanes, average up to two f64 elements loaded from %arg0,
// then reduce along axis 1 to the index of the maximum value and store that index as i64.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
// Constants. %cst is 0.0; %cst_0 is the bit pattern 0xFFF0000000000000, i.e. f64 -infinity,
// used as the neutral value for masked-off lanes of the max reduction.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc1) | |
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked> loc(#loc2) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc3) | |
%true = arith.constant true loc(#loc67) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc7) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc8) | |
%cst_3 = arith.constant dense<3> : tensor<1x32xi32, #blocked> loc(#loc9) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32, #blocked> loc(#loc10) | |
%cst_5 = arith.constant dense<32> : tensor<1x32xi32, #blocked> loc(#loc11) | |
// Lane index 0..31, expanded to shape 1x32 and converted to the #blocked layout (%3).
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #blocked1> loc(#loc12) | |
%1 = triton_gpu.convert_layout %0 : (tensor<32xi32, #blocked1>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc13) | |
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x32xi32, #blocked2> loc(#loc13) | |
%3 = triton_gpu.convert_layout %2 : (tensor<1x32xi32, #blocked2>) -> tensor<1x32xi32, #blocked> loc(#loc11) | |
// %4: bounds mask, index < 32.
%4 = arith.cmpi slt, %3, %cst_5 : tensor<1x32xi32, #blocked> loc(#loc11) | |
// Index arithmetic: %5 = idx % 2, %6 = idx / 2, %8 = (%5 * 3) / 2 (window start),
// %9 = %8 + 2 (window end), %10 = (%8 < %9) validity of the first window element.
%5 = arith.remsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc10) | |
%6 = arith.divsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc14) | |
%7 = arith.muli %5, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc9) | |
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc15) | |
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc16) | |
%10 = arith.cmpi slt, %8, %9 : tensor<1x32xi32, #blocked> loc(#loc17) | |
// First element: offset = %6 * 3 + %8; masked load from %arg0 with 0.0 as the fallback
// operand, then re-selected against %10 so invalid lanes contribute 0.0.
%11 = arith.muli %6, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc18) | |
%12 = arith.addi %11, %8 : tensor<1x32xi32, #blocked> loc(#loc19) | |
%13 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> loc(#loc20) | |
%14 = tt.addptr %13, %12 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc20) | |
%15 = arith.andi %4, %10 : tensor<1x32xi1, #blocked> loc(#loc21) | |
%16 = tt.load %14, %15, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked> loc(#loc22) | |
%17 = arith.select %10, %16, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc23) | |
// Second element: position %8 + 1 must be < %9 (%19); offset = %6 * 3 + 1 + %8.
%18 = arith.addi %8, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc8) | |
%19 = arith.cmpi slt, %18, %9 : tensor<1x32xi32, #blocked> loc(#loc24) | |
%20 = arith.addi %11, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc25) | |
%21 = arith.addi %20, %8 : tensor<1x32xi32, #blocked> loc(#loc26) | |
%22 = tt.addptr %13, %21 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc27) | |
%23 = arith.andi %4, %19 : tensor<1x32xi1, #blocked> loc(#loc28) | |
%24 = tt.load %22, %23, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked> loc(#loc29) | |
%25 = arith.select %19, %24, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc30) | |
// %26 = sum of the two (masked) values; %29 = number of valid elements (1.0 per valid one);
// %30 = sum / count, i.e. the mean over the valid elements.
%26 = arith.addf %25, %17 : tensor<1x32xf64, #blocked> loc(#loc31) | |
%27 = arith.select %10, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc7) | |
%28 = arith.select %19, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc32) | |
%29 = arith.addf %28, %27 : tensor<1x32xf64, #blocked> loc(#loc33) | |
%30 = arith.divf %26, %29 : tensor<1x32xf64, #blocked> loc(#loc34) | |
// Out-of-bounds lanes are replaced by -infinity so they cannot win the max reduction.
%31 = arith.select %4, %30, %cst_0 : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc2) | |
// Two-operand reduction along axis 1 over (value, index) pairs; the region below is the
// combine function. (%arg4, %arg5) is one (value, index) pair, (%arg6, %arg7) the other.
%32:2 = "tt.reduce"(%31, %3) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
// Ordered comparisons: both are false if either operand is NaN.
%40 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68) | |
%41 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69) | |
// `x une x` is true exactly when x is NaN: %42 = isnan(%arg4), %43 = isnan(%arg6).
%42 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70) | |
%43 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71) | |
// %44 = !isnan(%arg6) (xor with true); %45 = only the first value is NaN.
%44 = arith.xori %43, %true : i1 loc(#loc67) | |
%45 = arith.andi %42, %44 : i1 loc(#loc72) | |
// %46: first value strictly wins (greater, or NaN against a non-NaN).
%46 = arith.ori %40, %45 : i1 loc(#loc73) | |
// %48: the values tie (equal, or both NaN).
%47 = arith.andi %42, %43 : i1 loc(#loc74) | |
%48 = arith.ori %41, %47 : i1 loc(#loc75) | |
// On a tie the smaller index wins.
%49 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76) | |
%50 = arith.andi %48, %49 : i1 loc(#loc77) | |
// %51: keep the first pair; otherwise keep the second pair.
%51 = arith.ori %46, %50 : i1 loc(#loc78) | |
%52 = arith.select %51, %arg4, %arg6 : f64 loc(#loc79) | |
%53 = arith.select %51, %arg5, %arg7 : i32 loc(#loc80) | |
tt.reduce.return %52, %53 : f64, i32 loc(#loc53) | |
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) loc(#loc53) | |
// Only the index result (%32#1) is used. It goes through two layout conversions, is expanded
// to 1x1, and is converted once more to the layout used by the store.
%33 = triton_gpu.convert_layout %32#1 : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi32, #blocked1> loc(#loc49) | |
%34 = triton_gpu.convert_layout %33 : (tensor<1xi32, #blocked1>) -> tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked3}>> loc(#loc49) | |
%35 = tt.expand_dims %34 {axis = 1 : i32} : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked3}>>) -> tensor<1x1xi32, #blocked3> loc(#loc49) | |
%36 = triton_gpu.convert_layout %35 : (tensor<1x1xi32, #blocked3>) -> tensor<1x1xi32, #blocked4> loc(#loc50) | |
// Destination is %arg1 + 0; the i32 index is sign-extended to i64 and stored unmasked.
%37 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3) | |
%38 = tt.splat %37 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked4> loc(#loc3) | |
%39 = arith.extsi %36 : tensor<1x1xi32, #blocked4> to tensor<1x1xi64, #blocked4> loc(#loc50) | |
tt.store %38, %39 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked4> loc(#loc50) | |
tt.return loc(#loc51) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc52 = loc(callsite(#loc4 at #loc5)) | |
#loc53 = loc(callsite(#loc5 at #loc6)) | |
#loc54 = loc(callsite(#loc36 at #loc5)) | |
#loc55 = loc(callsite(#loc37 at #loc5)) | |
#loc56 = loc(callsite(#loc38 at #loc5)) | |
#loc57 = loc(callsite(#loc39 at #loc5)) | |
#loc58 = loc(callsite(#loc40 at #loc5)) | |
#loc59 = loc(callsite(#loc41 at #loc5)) | |
#loc60 = loc(callsite(#loc42 at #loc5)) | |
#loc61 = loc(callsite(#loc43 at #loc5)) | |
#loc62 = loc(callsite(#loc44 at #loc5)) | |
#loc63 = loc(callsite(#loc45 at #loc5)) | |
#loc64 = loc(callsite(#loc46 at #loc5)) | |
#loc65 = loc(callsite(#loc47 at #loc5)) | |
#loc66 = loc(callsite(#loc48 at #loc5)) | |
#loc67 = loc(callsite(#loc52 at #loc6)) | |
#loc68 = loc(callsite(#loc54 at #loc6)) | |
#loc69 = loc(callsite(#loc55 at #loc6)) | |
#loc70 = loc(callsite(#loc56 at #loc6)) | |
#loc71 = loc(callsite(#loc57 at #loc6)) | |
#loc72 = loc(callsite(#loc58 at #loc6)) | |
#loc73 = loc(callsite(#loc59 at #loc6)) | |
#loc74 = loc(callsite(#loc60 at #loc6)) | |
#loc75 = loc(callsite(#loc61 at #loc6)) | |
#loc76 = loc(callsite(#loc62 at #loc6)) | |
#loc77 = loc(callsite(#loc63 at #loc6)) | |
#loc78 = loc(callsite(#loc64 at #loc6)) | |
#loc79 = loc(callsite(#loc65 at #loc6)) | |
#loc80 = loc(callsite(#loc66 at #loc6)) | |
// -----// IR Dump Before TritonGPUPlanCTAPass (triton-nvidia-gpu-plan-cta) ('builtin.module' operation) //----- // | |
// Layout attribute aliases referenced by the module in this dump.
// #blocked: 2-D layout, threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0];
// it is the layout carried by the 1x32 compute tensors in the function body.
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
// #blocked1: 1-D layout (32 threads per warp, 2 warps); used for the tt.make_range result
// and as an intermediate layout for the reduced index.
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> | |
// #blocked2: parent layout of the slice used when expanding the range to 1x32.
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}> | |
// #blocked3: layout of the operands of the tt.load / tt.store ops in this dump; it has the
// same fields as #blocked2 except CTAOrder = [1, 0].
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
// #blocked4 / #blocked5: layouts of the 1x1 index tensor after the reduction; they differ
// only in `order` ([0, 1] vs [1, 0]).
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc35 = loc(unknown) | |
// Module holding a single Triton kernel, as dumped before the triton-nvidia-gpu-plan-cta pass.
// Module attributes: compute capability 80, 1 CTA, 2 warps, 32 threads per warp.
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { | |
// Kernel entry point.
//   %arg0: f64 pointer, read by the two tt.load ops below.
//   %arg1: i64 pointer, written by the final tt.store.
//   %arg2, %arg3: i32 scalars that are not referenced anywhere in this body.
// Summary of the body: for 32 lanes, average up to two f64 elements loaded from %arg0,
// then reduce along axis 1 to the index of the maximum value and store that index as i64.
// In this dump every tt.load / tt.store operand is first converted to layout #blocked3.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
// Constants. %cst is 0.0; %cst_0 is the bit pattern 0xFFF0000000000000, i.e. f64 -infinity,
// used as the neutral value for masked-off lanes of the max reduction.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc1) | |
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked> loc(#loc2) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc3) | |
%true = arith.constant true loc(#loc67) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc7) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc8) | |
%cst_3 = arith.constant dense<3> : tensor<1x32xi32, #blocked> loc(#loc9) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32, #blocked> loc(#loc10) | |
%cst_5 = arith.constant dense<32> : tensor<1x32xi32, #blocked> loc(#loc11) | |
// Lane index 0..31, expanded to shape 1x32 and converted to the #blocked layout (%3).
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #blocked1> loc(#loc12) | |
%1 = triton_gpu.convert_layout %0 : (tensor<32xi32, #blocked1>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc13) | |
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x32xi32, #blocked2> loc(#loc13) | |
%3 = triton_gpu.convert_layout %2 : (tensor<1x32xi32, #blocked2>) -> tensor<1x32xi32, #blocked> loc(#loc11) | |
// %4: bounds mask, index < 32.
%4 = arith.cmpi slt, %3, %cst_5 : tensor<1x32xi32, #blocked> loc(#loc11) | |
// Index arithmetic: %5 = idx % 2, %6 = idx / 2, %8 = (%5 * 3) / 2 (window start),
// %9 = %8 + 2 (window end), %10 = (%8 < %9) validity of the first window element.
%5 = arith.remsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc10) | |
%6 = arith.divsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc14) | |
%7 = arith.muli %5, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc9) | |
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc15) | |
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc16) | |
%10 = arith.cmpi slt, %8, %9 : tensor<1x32xi32, #blocked> loc(#loc17) | |
// First element: offset = %6 * 3 + %8, mask = in-bounds AND valid.
%11 = arith.muli %6, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc18) | |
%12 = arith.addi %11, %8 : tensor<1x32xi32, #blocked> loc(#loc19) | |
%13 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> loc(#loc20) | |
%14 = tt.addptr %13, %12 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc20) | |
%15 = arith.andi %4, %10 : tensor<1x32xi1, #blocked> loc(#loc21) | |
// Pointer, mask and the 0.0 fallback are converted to #blocked3 for the load; the loaded
// value is converted back to #blocked and re-selected against %10.
%16 = triton_gpu.convert_layout %14 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc22) | |
%17 = triton_gpu.convert_layout %15 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc22) | |
%18 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc22) | |
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc22) | |
%20 = triton_gpu.convert_layout %19 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc22) | |
%21 = arith.select %10, %20, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc23) | |
// Second element: position %8 + 1 must be < %9 (%23); offset = %6 * 3 + 1 + %8.
%22 = arith.addi %8, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc8) | |
%23 = arith.cmpi slt, %22, %9 : tensor<1x32xi32, #blocked> loc(#loc24) | |
%24 = arith.addi %11, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc25) | |
%25 = arith.addi %24, %8 : tensor<1x32xi32, #blocked> loc(#loc26) | |
%26 = tt.addptr %13, %25 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc27) | |
%27 = arith.andi %4, %23 : tensor<1x32xi1, #blocked> loc(#loc28) | |
// Same convert -> load -> convert-back sequence as for the first element.
%28 = triton_gpu.convert_layout %26 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc29) | |
%29 = triton_gpu.convert_layout %27 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc29) | |
%30 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc29) | |
%31 = tt.load %28, %29, %30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc29) | |
%32 = triton_gpu.convert_layout %31 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc29) | |
%33 = arith.select %23, %32, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc30) | |
// %34 = sum of the two (masked) values; %37 = number of valid elements (1.0 per valid one);
// %38 = sum / count, i.e. the mean over the valid elements.
%34 = arith.addf %33, %21 : tensor<1x32xf64, #blocked> loc(#loc31) | |
%35 = arith.select %10, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc7) | |
%36 = arith.select %23, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc32) | |
%37 = arith.addf %36, %35 : tensor<1x32xf64, #blocked> loc(#loc33) | |
%38 = arith.divf %34, %37 : tensor<1x32xf64, #blocked> loc(#loc34) | |
// Out-of-bounds lanes are replaced by -infinity so they cannot win the max reduction.
%39 = arith.select %4, %38, %cst_0 : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc2) | |
// Two-operand reduction along axis 1 over (value, index) pairs; the region below is the
// combine function. (%arg4, %arg5) is one (value, index) pair, (%arg6, %arg7) the other.
%40:2 = "tt.reduce"(%39, %3) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
// Ordered comparisons: both are false if either operand is NaN.
%50 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68) | |
%51 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69) | |
// `x une x` is true exactly when x is NaN: %52 = isnan(%arg4), %53 = isnan(%arg6).
%52 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70) | |
%53 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71) | |
// %54 = !isnan(%arg6) (xor with true); %55 = only the first value is NaN.
%54 = arith.xori %53, %true : i1 loc(#loc67) | |
%55 = arith.andi %52, %54 : i1 loc(#loc72) | |
// %56: first value strictly wins (greater, or NaN against a non-NaN).
%56 = arith.ori %50, %55 : i1 loc(#loc73) | |
// %58: the values tie (equal, or both NaN).
%57 = arith.andi %52, %53 : i1 loc(#loc74) | |
%58 = arith.ori %51, %57 : i1 loc(#loc75) | |
// On a tie the smaller index wins.
%59 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76) | |
%60 = arith.andi %58, %59 : i1 loc(#loc77) | |
// %61: keep the first pair; otherwise keep the second pair.
%61 = arith.ori %56, %60 : i1 loc(#loc78) | |
%62 = arith.select %61, %arg4, %arg6 : f64 loc(#loc79) | |
%63 = arith.select %61, %arg5, %arg7 : i32 loc(#loc80) | |
tt.reduce.return %62, %63 : f64, i32 loc(#loc53) | |
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) loc(#loc53) | |
// Only the index result (%40#1) is used. It goes through two layout conversions, is expanded
// to 1x1 (#blocked4), and is converted to #blocked5.
%41 = triton_gpu.convert_layout %40#1 : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi32, #blocked1> loc(#loc49) | |
%42 = triton_gpu.convert_layout %41 : (tensor<1xi32, #blocked1>) -> tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc49) | |
%43 = tt.expand_dims %42 {axis = 1 : i32} : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>>) -> tensor<1x1xi32, #blocked4> loc(#loc49) | |
%44 = triton_gpu.convert_layout %43 : (tensor<1x1xi32, #blocked4>) -> tensor<1x1xi32, #blocked5> loc(#loc50) | |
// Destination is %arg1 + 0; the i32 index is sign-extended to i64.
%45 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3) | |
%46 = tt.splat %45 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked5> loc(#loc3) | |
%47 = arith.extsi %44 : tensor<1x1xi32, #blocked5> to tensor<1x1xi64, #blocked5> loc(#loc50) | |
// Pointer and value are converted to #blocked3 before the unmasked store.
%48 = triton_gpu.convert_layout %46 : (tensor<1x1x!tt.ptr<i64, 1>, #blocked5>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked3> loc(#loc50) | |
%49 = triton_gpu.convert_layout %47 : (tensor<1x1xi64, #blocked5>) -> tensor<1x1xi64, #blocked3> loc(#loc50) | |
tt.store %48, %49 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked3> loc(#loc50) | |
tt.return loc(#loc51) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc52 = loc(callsite(#loc4 at #loc5)) | |
#loc53 = loc(callsite(#loc5 at #loc6)) | |
#loc54 = loc(callsite(#loc36 at #loc5)) | |
#loc55 = loc(callsite(#loc37 at #loc5)) | |
#loc56 = loc(callsite(#loc38 at #loc5)) | |
#loc57 = loc(callsite(#loc39 at #loc5)) | |
#loc58 = loc(callsite(#loc40 at #loc5)) | |
#loc59 = loc(callsite(#loc41 at #loc5)) | |
#loc60 = loc(callsite(#loc42 at #loc5)) | |
#loc61 = loc(callsite(#loc43 at #loc5)) | |
#loc62 = loc(callsite(#loc44 at #loc5)) | |
#loc63 = loc(callsite(#loc45 at #loc5)) | |
#loc64 = loc(callsite(#loc46 at #loc5)) | |
#loc65 = loc(callsite(#loc47 at #loc5)) | |
#loc66 = loc(callsite(#loc48 at #loc5)) | |
#loc67 = loc(callsite(#loc52 at #loc6)) | |
#loc68 = loc(callsite(#loc54 at #loc6)) | |
#loc69 = loc(callsite(#loc55 at #loc6)) | |
#loc70 = loc(callsite(#loc56 at #loc6)) | |
#loc71 = loc(callsite(#loc57 at #loc6)) | |
#loc72 = loc(callsite(#loc58 at #loc6)) | |
#loc73 = loc(callsite(#loc59 at #loc6)) | |
#loc74 = loc(callsite(#loc60 at #loc6)) | |
#loc75 = loc(callsite(#loc61 at #loc6)) | |
#loc76 = loc(callsite(#loc62 at #loc6)) | |
#loc77 = loc(callsite(#loc63 at #loc6)) | |
#loc78 = loc(callsite(#loc64 at #loc6)) | |
#loc79 = loc(callsite(#loc65 at #loc6)) | |
#loc80 = loc(callsite(#loc66 at #loc6)) | |
// -----// IR Dump Before TritonGPURewriteTensorPointer (tritongpu-rewrite-tensor-pointer) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc35 = loc(unknown) | |
// Module attributes: compute capability 80, 1 CTA, 2 warps per CTA, 32 threads per warp.
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { | |
// Kernel @triton__0d1d23de.
//   %arg0: pointer to f64 input values.
//   %arg1: pointer to the i64 output (a single element is stored).
//   %arg2, %arg3: i32 arguments that are not referenced anywhere in the body.
// For each of 32 lanes the kernel sums up to two adjacent f64 loads, divides by the
// number of enabled loads, and then reduces the 32 results to the index of the
// largest one (NaN ranks highest, ties go to the smaller index).
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
// Splat constants. 0xFFF0000000000000 is the IEEE-754 f64 bit pattern of -infinity.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc1) | |
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked> loc(#loc2) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc3) | |
%true = arith.constant true loc(#loc67) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc7) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc8) | |
%cst_3 = arith.constant dense<3> : tensor<1x32xi32, #blocked> loc(#loc9) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32, #blocked> loc(#loc10) | |
%cst_5 = arith.constant dense<32> : tensor<1x32xi32, #blocked> loc(#loc11) | |
// Lane indices 0..31, expanded to 1x32 and converted into the #blocked layout (%3).
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #blocked1> loc(#loc12) | |
%1 = triton_gpu.convert_layout %0 : (tensor<32xi32, #blocked1>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc13) | |
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x32xi32, #blocked2> loc(#loc13) | |
%3 = triton_gpu.convert_layout %2 : (tensor<1x32xi32, #blocked2>) -> tensor<1x32xi32, #blocked> loc(#loc11) | |
// %4: lane < 32 mask. %5 = lane % 2, %6 = lane / 2.
%4 = arith.cmpi slt, %3, %cst_5 : tensor<1x32xi32, #blocked> loc(#loc11) | |
%5 = arith.remsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc10) | |
%6 = arith.divsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc14) | |
// %8 = ((lane % 2) * 3) / 2 and %9 = %8 + 2; %10 tests %8 < %9.
// NOTE(review): presumably the start/end bounds of a pooling window - confirm
// against the generated Python source named in the loc annotations.
%7 = arith.muli %5, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc9) | |
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc15) | |
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc16) | |
%10 = arith.cmpi slt, %8, %9 : tensor<1x32xi32, #blocked> loc(#loc17) | |
// First load: element offset (lane / 2) * 3 + %8 from %arg0, masked by %4 & %10,
// with 0.0 supplied for masked-out lanes. Operands are converted to #blocked3 for
// the load and the result is converted back to #blocked.
%11 = arith.muli %6, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc18) | |
%12 = arith.addi %11, %8 : tensor<1x32xi32, #blocked> loc(#loc19) | |
%13 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> loc(#loc20) | |
%14 = tt.addptr %13, %12 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc20) | |
%15 = arith.andi %4, %10 : tensor<1x32xi1, #blocked> loc(#loc21) | |
%16 = triton_gpu.convert_layout %14 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc22) | |
%17 = triton_gpu.convert_layout %15 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc22) | |
%18 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc22) | |
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc22) | |
%20 = triton_gpu.convert_layout %19 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc22) | |
// %21: first loaded value where %10 holds, otherwise 0.0.
%21 = arith.select %10, %20, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc23) | |
// Second load: same scheme one element further on (offset %11 + 1 + %8), enabled
// where %8 + 1 < %9 (%23) and masked by %4 & %23.
%22 = arith.addi %8, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc8) | |
%23 = arith.cmpi slt, %22, %9 : tensor<1x32xi32, #blocked> loc(#loc24) | |
%24 = arith.addi %11, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc25) | |
%25 = arith.addi %24, %8 : tensor<1x32xi32, #blocked> loc(#loc26) | |
%26 = tt.addptr %13, %25 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc27) | |
%27 = arith.andi %4, %23 : tensor<1x32xi1, #blocked> loc(#loc28) | |
%28 = triton_gpu.convert_layout %26 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc29) | |
%29 = triton_gpu.convert_layout %27 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc29) | |
%30 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc29) | |
%31 = tt.load %28, %29, %30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc29) | |
%32 = triton_gpu.convert_layout %31 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc29) | |
%33 = arith.select %23, %32, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc30) | |
// %34: sum of the two selected values. %35/%36 turn the predicates %10/%23 into
// 1.0 or 0.0, so %37 is the number of enabled loads and %38 = sum / count.
%34 = arith.addf %33, %21 : tensor<1x32xf64, #blocked> loc(#loc31) | |
%35 = arith.select %10, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc7) | |
%36 = arith.select %23, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc32) | |
%37 = arith.addf %36, %35 : tensor<1x32xf64, #blocked> loc(#loc33) | |
%38 = arith.divf %34, %37 : tensor<1x32xf64, #blocked> loc(#loc34) | |
// Lanes failing the %4 mask contribute -infinity to the reduction below.
%39 = arith.select %4, %38, %cst_0 : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc2) | |
// Reduce (value, lane index) pairs along axis 1. In the combine region
// (%arg4, %arg5) is one (value, index) pair and (%arg6, %arg7) the other; the
// first pair is kept when %61 is true, otherwise the second pair is kept.
%40:2 = "tt.reduce"(%39, %3) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
%50 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68) | |
%51 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69) | |
// Comparing a value with itself using `une` is true only for NaN.
%52 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70) | |
%53 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71) | |
// %56: first value is greater, or the first is NaN while the second is not.
%54 = arith.xori %53, %true : i1 loc(#loc67) | |
%55 = arith.andi %52, %54 : i1 loc(#loc72) | |
%56 = arith.ori %50, %55 : i1 loc(#loc73) | |
// %58: values are equal, or both are NaN; %60 then prefers the smaller index.
%57 = arith.andi %52, %53 : i1 loc(#loc74) | |
%58 = arith.ori %51, %57 : i1 loc(#loc75) | |
%59 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76) | |
%60 = arith.andi %58, %59 : i1 loc(#loc77) | |
%61 = arith.ori %56, %60 : i1 loc(#loc78) | |
%62 = arith.select %61, %arg4, %arg6 : f64 loc(#loc79) | |
%63 = arith.select %61, %arg5, %arg7 : i32 loc(#loc80) | |
tt.reduce.return %62, %63 : f64, i32 loc(#loc53) | |
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) loc(#loc53) | |
// Only the index result (%40#1) is used below: it goes through layout
// conversions, is expanded to 1x1, sign-extended from i32 to i64, and stored
// through %arg1 + 0. The reduced value %40#0 has no further use.
%41 = triton_gpu.convert_layout %40#1 : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi32, #blocked1> loc(#loc49) | |
%42 = triton_gpu.convert_layout %41 : (tensor<1xi32, #blocked1>) -> tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc49) | |
%43 = tt.expand_dims %42 {axis = 1 : i32} : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>>) -> tensor<1x1xi32, #blocked4> loc(#loc49) | |
%44 = triton_gpu.convert_layout %43 : (tensor<1x1xi32, #blocked4>) -> tensor<1x1xi32, #blocked5> loc(#loc50) | |
%45 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3) | |
%46 = tt.splat %45 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked5> loc(#loc3) | |
%47 = arith.extsi %44 : tensor<1x1xi32, #blocked5> to tensor<1x1xi64, #blocked5> loc(#loc50) | |
%48 = triton_gpu.convert_layout %46 : (tensor<1x1x!tt.ptr<i64, 1>, #blocked5>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked3> loc(#loc50) | |
%49 = triton_gpu.convert_layout %47 : (tensor<1x1xi64, #blocked5>) -> tensor<1x1xi64, #blocked3> loc(#loc50) | |
tt.store %48, %49 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked3> loc(#loc50) | |
tt.return loc(#loc51) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc52 = loc(callsite(#loc4 at #loc5)) | |
#loc53 = loc(callsite(#loc5 at #loc6)) | |
#loc54 = loc(callsite(#loc36 at #loc5)) | |
#loc55 = loc(callsite(#loc37 at #loc5)) | |
#loc56 = loc(callsite(#loc38 at #loc5)) | |
#loc57 = loc(callsite(#loc39 at #loc5)) | |
#loc58 = loc(callsite(#loc40 at #loc5)) | |
#loc59 = loc(callsite(#loc41 at #loc5)) | |
#loc60 = loc(callsite(#loc42 at #loc5)) | |
#loc61 = loc(callsite(#loc43 at #loc5)) | |
#loc62 = loc(callsite(#loc44 at #loc5)) | |
#loc63 = loc(callsite(#loc45 at #loc5)) | |
#loc64 = loc(callsite(#loc46 at #loc5)) | |
#loc65 = loc(callsite(#loc47 at #loc5)) | |
#loc66 = loc(callsite(#loc48 at #loc5)) | |
#loc67 = loc(callsite(#loc52 at #loc6)) | |
#loc68 = loc(callsite(#loc54 at #loc6)) | |
#loc69 = loc(callsite(#loc55 at #loc6)) | |
#loc70 = loc(callsite(#loc56 at #loc6)) | |
#loc71 = loc(callsite(#loc57 at #loc6)) | |
#loc72 = loc(callsite(#loc58 at #loc6)) | |
#loc73 = loc(callsite(#loc59 at #loc6)) | |
#loc74 = loc(callsite(#loc60 at #loc6)) | |
#loc75 = loc(callsite(#loc61 at #loc6)) | |
#loc76 = loc(callsite(#loc62 at #loc6)) | |
#loc77 = loc(callsite(#loc63 at #loc6)) | |
#loc78 = loc(callsite(#loc64 at #loc6)) | |
#loc79 = loc(callsite(#loc65 at #loc6)) | |
#loc80 = loc(callsite(#loc66 at #loc6)) | |
// -----// IR Dump Before TritonGPUPlanCTAPass (triton-nvidia-gpu-plan-cta) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc35 = loc(unknown) | |
// Module attributes: compute capability 80, 1 CTA, 2 warps per CTA, 32 threads per warp.
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { | |
// Kernel @triton__0d1d23de.
//   %arg0: pointer to f64 input values.
//   %arg1: pointer to the i64 output (a single element is stored).
//   %arg2, %arg3: i32 arguments that are not referenced anywhere in the body.
// For each of 32 lanes the kernel sums up to two adjacent f64 loads, divides by the
// number of enabled loads, and then reduces the 32 results to the index of the
// largest one (NaN ranks highest, ties go to the smaller index).
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
// Splat constants. 0xFFF0000000000000 is the IEEE-754 f64 bit pattern of -infinity.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc1) | |
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked> loc(#loc2) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc3) | |
%true = arith.constant true loc(#loc67) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc7) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc8) | |
%cst_3 = arith.constant dense<3> : tensor<1x32xi32, #blocked> loc(#loc9) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32, #blocked> loc(#loc10) | |
%cst_5 = arith.constant dense<32> : tensor<1x32xi32, #blocked> loc(#loc11) | |
// Lane indices 0..31, expanded to 1x32 and converted into the #blocked layout (%3).
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #blocked1> loc(#loc12) | |
%1 = triton_gpu.convert_layout %0 : (tensor<32xi32, #blocked1>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc13) | |
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x32xi32, #blocked2> loc(#loc13) | |
%3 = triton_gpu.convert_layout %2 : (tensor<1x32xi32, #blocked2>) -> tensor<1x32xi32, #blocked> loc(#loc11) | |
// %4: lane < 32 mask. %5 = lane % 2, %6 = lane / 2.
%4 = arith.cmpi slt, %3, %cst_5 : tensor<1x32xi32, #blocked> loc(#loc11) | |
%5 = arith.remsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc10) | |
%6 = arith.divsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc14) | |
// %8 = ((lane % 2) * 3) / 2 and %9 = %8 + 2; %10 tests %8 < %9.
// NOTE(review): presumably the start/end bounds of a pooling window - confirm
// against the generated Python source named in the loc annotations.
%7 = arith.muli %5, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc9) | |
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc15) | |
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc16) | |
%10 = arith.cmpi slt, %8, %9 : tensor<1x32xi32, #blocked> loc(#loc17) | |
// First load: element offset (lane / 2) * 3 + %8 from %arg0, masked by %4 & %10,
// with 0.0 supplied for masked-out lanes. Operands are converted to #blocked3 for
// the load and the result is converted back to #blocked.
%11 = arith.muli %6, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc18) | |
%12 = arith.addi %11, %8 : tensor<1x32xi32, #blocked> loc(#loc19) | |
%13 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> loc(#loc20) | |
%14 = tt.addptr %13, %12 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc20) | |
%15 = arith.andi %4, %10 : tensor<1x32xi1, #blocked> loc(#loc21) | |
%16 = triton_gpu.convert_layout %14 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc22) | |
%17 = triton_gpu.convert_layout %15 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc22) | |
%18 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc22) | |
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc22) | |
%20 = triton_gpu.convert_layout %19 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc22) | |
// %21: first loaded value where %10 holds, otherwise 0.0.
%21 = arith.select %10, %20, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc23) | |
// Second load: same scheme one element further on (offset %11 + 1 + %8), enabled
// where %8 + 1 < %9 (%23) and masked by %4 & %23.
%22 = arith.addi %8, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc8) | |
%23 = arith.cmpi slt, %22, %9 : tensor<1x32xi32, #blocked> loc(#loc24) | |
%24 = arith.addi %11, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc25) | |
%25 = arith.addi %24, %8 : tensor<1x32xi32, #blocked> loc(#loc26) | |
%26 = tt.addptr %13, %25 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc27) | |
%27 = arith.andi %4, %23 : tensor<1x32xi1, #blocked> loc(#loc28) | |
%28 = triton_gpu.convert_layout %26 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc29) | |
%29 = triton_gpu.convert_layout %27 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc29) | |
%30 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc29) | |
%31 = tt.load %28, %29, %30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc29) | |
%32 = triton_gpu.convert_layout %31 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc29) | |
%33 = arith.select %23, %32, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc30) | |
// %34: sum of the two selected values. %35/%36 turn the predicates %10/%23 into
// 1.0 or 0.0, so %37 is the number of enabled loads and %38 = sum / count.
%34 = arith.addf %33, %21 : tensor<1x32xf64, #blocked> loc(#loc31) | |
%35 = arith.select %10, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc7) | |
%36 = arith.select %23, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc32) | |
%37 = arith.addf %36, %35 : tensor<1x32xf64, #blocked> loc(#loc33) | |
%38 = arith.divf %34, %37 : tensor<1x32xf64, #blocked> loc(#loc34) | |
// Lanes failing the %4 mask contribute -infinity to the reduction below.
%39 = arith.select %4, %38, %cst_0 : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc2) | |
// Reduce (value, lane index) pairs along axis 1. In the combine region
// (%arg4, %arg5) is one (value, index) pair and (%arg6, %arg7) the other; the
// first pair is kept when %61 is true, otherwise the second pair is kept.
%40:2 = "tt.reduce"(%39, %3) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
%50 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68) | |
%51 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69) | |
// Comparing a value with itself using `une` is true only for NaN.
%52 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70) | |
%53 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71) | |
// %56: first value is greater, or the first is NaN while the second is not.
%54 = arith.xori %53, %true : i1 loc(#loc67) | |
%55 = arith.andi %52, %54 : i1 loc(#loc72) | |
%56 = arith.ori %50, %55 : i1 loc(#loc73) | |
// %58: values are equal, or both are NaN; %60 then prefers the smaller index.
%57 = arith.andi %52, %53 : i1 loc(#loc74) | |
%58 = arith.ori %51, %57 : i1 loc(#loc75) | |
%59 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76) | |
%60 = arith.andi %58, %59 : i1 loc(#loc77) | |
%61 = arith.ori %56, %60 : i1 loc(#loc78) | |
%62 = arith.select %61, %arg4, %arg6 : f64 loc(#loc79) | |
%63 = arith.select %61, %arg5, %arg7 : i32 loc(#loc80) | |
tt.reduce.return %62, %63 : f64, i32 loc(#loc53) | |
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) loc(#loc53) | |
// Only the index result (%40#1) is used below: it goes through layout
// conversions, is expanded to 1x1, sign-extended from i32 to i64, and stored
// through %arg1 + 0. The reduced value %40#0 has no further use.
%41 = triton_gpu.convert_layout %40#1 : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi32, #blocked1> loc(#loc49) | |
%42 = triton_gpu.convert_layout %41 : (tensor<1xi32, #blocked1>) -> tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc49) | |
%43 = tt.expand_dims %42 {axis = 1 : i32} : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>>) -> tensor<1x1xi32, #blocked4> loc(#loc49) | |
%44 = triton_gpu.convert_layout %43 : (tensor<1x1xi32, #blocked4>) -> tensor<1x1xi32, #blocked5> loc(#loc50) | |
%45 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3) | |
%46 = tt.splat %45 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked5> loc(#loc3) | |
%47 = arith.extsi %44 : tensor<1x1xi32, #blocked5> to tensor<1x1xi64, #blocked5> loc(#loc50) | |
%48 = triton_gpu.convert_layout %46 : (tensor<1x1x!tt.ptr<i64, 1>, #blocked5>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked3> loc(#loc50) | |
%49 = triton_gpu.convert_layout %47 : (tensor<1x1xi64, #blocked5>) -> tensor<1x1xi64, #blocked3> loc(#loc50) | |
tt.store %48, %49 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked3> loc(#loc50) | |
tt.return loc(#loc51) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc52 = loc(callsite(#loc4 at #loc5)) | |
#loc53 = loc(callsite(#loc5 at #loc6)) | |
#loc54 = loc(callsite(#loc36 at #loc5)) | |
#loc55 = loc(callsite(#loc37 at #loc5)) | |
#loc56 = loc(callsite(#loc38 at #loc5)) | |
#loc57 = loc(callsite(#loc39 at #loc5)) | |
#loc58 = loc(callsite(#loc40 at #loc5)) | |
#loc59 = loc(callsite(#loc41 at #loc5)) | |
#loc60 = loc(callsite(#loc42 at #loc5)) | |
#loc61 = loc(callsite(#loc43 at #loc5)) | |
#loc62 = loc(callsite(#loc44 at #loc5)) | |
#loc63 = loc(callsite(#loc45 at #loc5)) | |
#loc64 = loc(callsite(#loc46 at #loc5)) | |
#loc65 = loc(callsite(#loc47 at #loc5)) | |
#loc66 = loc(callsite(#loc48 at #loc5)) | |
#loc67 = loc(callsite(#loc52 at #loc6)) | |
#loc68 = loc(callsite(#loc54 at #loc6)) | |
#loc69 = loc(callsite(#loc55 at #loc6)) | |
#loc70 = loc(callsite(#loc56 at #loc6)) | |
#loc71 = loc(callsite(#loc57 at #loc6)) | |
#loc72 = loc(callsite(#loc58 at #loc6)) | |
#loc73 = loc(callsite(#loc59 at #loc6)) | |
#loc74 = loc(callsite(#loc60 at #loc6)) | |
#loc75 = loc(callsite(#loc61 at #loc6)) | |
#loc76 = loc(callsite(#loc62 at #loc6)) | |
#loc77 = loc(callsite(#loc63 at #loc6)) | |
#loc78 = loc(callsite(#loc64 at #loc6)) | |
#loc79 = loc(callsite(#loc65 at #loc6)) | |
#loc80 = loc(callsite(#loc66 at #loc6)) | |
// -----// IR Dump Before TritonGPURemoveLayoutConversions (tritongpu-remove-layout-conversions) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> | |
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}> | |
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0) | |
#loc35 = loc(unknown) | |
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { | |
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} { | |
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc1) | |
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked> loc(#loc2) | |
%c0_i32 = arith.constant 0 : i32 loc(#loc3) | |
%true = arith.constant true loc(#loc67) | |
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc7) | |
%cst_2 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc8) | |
%cst_3 = arith.constant dense<3> : tensor<1x32xi32, #blocked> loc(#loc9) | |
%cst_4 = arith.constant dense<2> : tensor<1x32xi32, #blocked> loc(#loc10) | |
%cst_5 = arith.constant dense<32> : tensor<1x32xi32, #blocked> loc(#loc11) | |
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #blocked1> loc(#loc12) | |
%1 = triton_gpu.convert_layout %0 : (tensor<32xi32, #blocked1>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc13) | |
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x32xi32, #blocked2> loc(#loc13) | |
%3 = triton_gpu.convert_layout %2 : (tensor<1x32xi32, #blocked2>) -> tensor<1x32xi32, #blocked> loc(#loc11) | |
%4 = arith.cmpi slt, %3, %cst_5 : tensor<1x32xi32, #blocked> loc(#loc11) | |
%5 = arith.remsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc10) | |
%6 = arith.divsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc14) | |
%7 = arith.muli %5, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc9) | |
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc15) | |
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc16) | |
%10 = arith.cmpi slt, %8, %9 : tensor<1x32xi32, #blocked> loc(#loc17) | |
%11 = arith.muli %6, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc18) | |
%12 = arith.addi %11, %8 : tensor<1x32xi32, #blocked> loc(#loc19) | |
%13 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> loc(#loc20) | |
%14 = tt.addptr %13, %12 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc20) | |
%15 = arith.andi %4, %10 : tensor<1x32xi1, #blocked> loc(#loc21) | |
%16 = triton_gpu.convert_layout %14 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc22) | |
%17 = triton_gpu.convert_layout %15 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc22) | |
%18 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc22) | |
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc22) | |
%20 = triton_gpu.convert_layout %19 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc22) | |
%21 = arith.select %10, %20, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc23) | |
%22 = arith.addi %8, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc8) | |
%23 = arith.cmpi slt, %22, %9 : tensor<1x32xi32, #blocked> loc(#loc24) | |
%24 = arith.addi %11, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc25) | |
%25 = arith.addi %24, %8 : tensor<1x32xi32, #blocked> loc(#loc26) | |
%26 = tt.addptr %13, %25 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc27) | |
%27 = arith.andi %4, %23 : tensor<1x32xi1, #blocked> loc(#loc28) | |
%28 = triton_gpu.convert_layout %26 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc29) | |
%29 = triton_gpu.convert_layout %27 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc29) | |
%30 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc29) | |
%31 = tt.load %28, %29, %30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc29) | |
%32 = triton_gpu.convert_layout %31 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc29) | |
%33 = arith.select %23, %32, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc30) | |
%34 = arith.addf %33, %21 : tensor<1x32xf64, #blocked> loc(#loc31) | |
%35 = arith.select %10, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc7) | |
%36 = arith.select %23, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc32) | |
%37 = arith.addf %36, %35 : tensor<1x32xf64, #blocked> loc(#loc33) | |
%38 = arith.divf %34, %37 : tensor<1x32xf64, #blocked> loc(#loc34) | |
%39 = arith.select %4, %38, %cst_0 : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc2) | |
%40:2 = "tt.reduce"(%39, %3) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)): | |
%50 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68) | |
%51 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69) | |
%52 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70) | |
%53 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71) | |
%54 = arith.xori %53, %true : i1 loc(#loc67) | |
%55 = arith.andi %52, %54 : i1 loc(#loc72) | |
%56 = arith.ori %50, %55 : i1 loc(#loc73) | |
%57 = arith.andi %52, %53 : i1 loc(#loc74) | |
%58 = arith.ori %51, %57 : i1 loc(#loc75) | |
%59 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76) | |
%60 = arith.andi %58, %59 : i1 loc(#loc77) | |
%61 = arith.ori %56, %60 : i1 loc(#loc78) | |
%62 = arith.select %61, %arg4, %arg6 : f64 loc(#loc79) | |
%63 = arith.select %61, %arg5, %arg7 : i32 loc(#loc80) | |
tt.reduce.return %62, %63 : f64, i32 loc(#loc53) | |
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) loc(#loc53) | |
%41 = triton_gpu.convert_layout %40#1 : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi32, #blocked1> loc(#loc49) | |
%42 = triton_gpu.convert_layout %41 : (tensor<1xi32, #blocked1>) -> tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc49) | |
%43 = tt.expand_dims %42 {axis = 1 : i32} : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>>) -> tensor<1x1xi32, #blocked4> loc(#loc49) | |
%44 = triton_gpu.convert_layout %43 : (tensor<1x1xi32, #blocked4>) -> tensor<1x1xi32, #blocked5> loc(#loc50) | |
%45 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3) | |
%46 = tt.splat %45 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked5> loc(#loc3) | |
%47 = arith.extsi %44 : tensor<1x1xi32, #blocked5> to tensor<1x1xi64, #blocked5> loc(#loc50) | |
%48 = triton_gpu.convert_layout %46 : (tensor<1x1x!tt.ptr<i64, 1>, #blocked5>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked3> loc(#loc50) | |
%49 = triton_gpu.convert_layout %47 : (tensor<1x1xi64, #blocked5>) -> tensor<1x1xi64, #blocked3> loc(#loc50) | |
tt.store %48, %49 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked3> loc(#loc50) | |
tt.return loc(#loc51) | |
} loc(#loc) | |
} loc(#loc) | |
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36) | |
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35) | |
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25) | |
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32) | |
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42) | |
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63) | |
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34) | |
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17) | |
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15) | |
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18) | |
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21) | |
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26) | |
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34) | |
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20) | |
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22) | |
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16) | |
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18) | |
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49) | |
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56) | |
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30) | |
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98) | |
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90) | |
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32) | |
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20) | |
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52) | |
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61) | |
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31) | |
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103) | |
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95) | |
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35) | |
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20) | |
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35) | |
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20) | |
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20) | |
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21) | |
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23) | |
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29) | |
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29) | |
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28) | |
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16) | |
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29) | |
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17) | |
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31) | |
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21) | |
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12) | |
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35) | |
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69) | |
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22) | |
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68) | |
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4) | |
#loc52 = loc(callsite(#loc4 at #loc5)) | |
#loc53 = loc(callsite(#loc5 at #loc6)) | |
#loc54 = loc(callsite(#loc36 at #loc5)) | |
#loc55 = loc(callsite(#loc37 at #loc5)) | |
#loc56 = loc(callsite(#loc38 at #loc5)) | |
#loc57 = loc(callsite(#loc39 at #loc5)) | |
#loc58 = loc(callsite(#loc40 at #loc5)) | |
#loc59 = loc(callsite(#loc41 at #loc5)) | |
#loc60 = loc(callsite(#loc42 at #loc5)) | |
#loc61 = loc(callsite(#loc43 at #loc5)) | |
#loc62 = loc(callsite(#loc44 at #loc5)) | |
#loc63 = loc(callsite(#loc45 at #loc5)) | |
#loc64 = loc(callsite(#loc46 at #loc5)) | |
#loc65 = loc(callsite(#loc47 at #loc5)) | |
#loc66 = loc(callsite(#loc48 at #loc5)) | |
#loc67 = loc(callsite(#loc52 at #loc6)) | |
#loc68 = loc(callsite(#loc54 at #loc6)) | |
#loc69 = loc(callsite(#loc55 at #loc6)) | |
#loc70 = loc(callsite(#loc56 at #loc6)) | |
#loc71 = loc(callsite(#loc57 at #loc6)) | |
#loc72 = loc(callsite(#loc58 at #loc6)) | |
#loc73 = loc(callsite(#loc59 at #loc6)) | |
#loc74 = loc(callsite(#loc60 at #loc6)) | |
#loc75 = loc(callsite(#loc61 at #loc6)) | |
#loc76 = loc(callsite(#loc62 at #loc6)) | |
#loc77 = loc(callsite(#loc63 at #loc6)) | |
#loc78 = loc(callsite(#loc64 at #loc6)) | |
#loc79 = loc(callsite(#loc65 at #loc6)) | |
#loc80 = loc(callsite(#loc66 at #loc6)) | |
loc(callsite("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42 at "/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)): error: 'tt.reduce' op inferred type(s) 'tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>}>>', 'tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>}>>' are incompatible with return type(s) of operation 'tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>}>>', 'tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>}>>' | |
loc(callsite("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42 at "/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)): error: 'tt.reduce' op failed to infer returned types | |
// -----// IR Dump After TritonGPURemoveLayoutConversions Failed (tritongpu-remove-layout-conversions) ('builtin.module' operation) //----- // | |
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> | |
"builtin.module"() ({ | |
"tt.func"() <{arg_attrs = [{tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {}, {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}], function_type = (!tt.ptr<f64, 1>, !tt.ptr<i64, 1>, i32, i32) -> (), sym_name = "triton__0d1d23de", sym_visibility = "public"}> ({ | |
^bb0(%arg0: !tt.ptr<f64, 1>, %arg1: !tt.ptr<i64, 1>, %arg2: i32, %arg3: i32): | |
%0 = "arith.constant"() <{value = dense<0.000000e+00> : tensor<1x32xf64, #blocked>}> : () -> tensor<1x32xf64, #blocked> | |
%1 = "arith.constant"() <{value = dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked>}> : () -> tensor<1x32xf64, #blocked> | |
%2 = "arith.constant"() <{value = dense<1.000000e+00> : tensor<1x32xf64, #blocked>}> : () -> tensor<1x32xf64, #blocked> | |
%3 = "arith.constant"() <{value = dense<1> : tensor<1x32xi32, #blocked>}> : () -> tensor<1x32xi32, #blocked> | |
%4 = "arith.constant"() <{value = dense<3> : tensor<1x32xi32, #blocked>}> : () -> tensor<1x32xi32, #blocked> | |
%5 = "arith.constant"() <{value = dense<2> : tensor<1x32xi32, #blocked>}> : () -> tensor<1x32xi32, #blocked> | |
%6 = "arith.constant"() <{value = dense<32> : tensor<1x32xi32, #blocked>}> : () -> tensor<1x32xi32, #blocked> | |
%7 = "arith.constant"() <{value = true}> : () -> i1 | |
%8 = "arith.constant"() <{value = 0 : i32}> : () -> i32 | |
%9 = "tt.make_range"() <{end = 32 : i32, start = 0 : i32}> : () -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> | |
%10 = "tt.make_range"() <{end = 32 : i32, start = 0 : i32}> : () -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> | |
%11 = "tt.make_range"() <{end = 32 : i32, start = 0 : i32}> : () -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> | |
%12 = "tt.make_range"() <{end = 32 : i32, start = 0 : i32}> : () -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> | |
%13 = "tt.make_range"() <{end = 32 : i32, start = 0 : i32}> : () -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> | |
%14 = "tt.expand_dims"(%9) <{axis = 0 : i32}> : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x32xi32, #blocked> | |
%15 = "tt.expand_dims"(%10) <{axis = 0 : i32}> : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x32xi32, #blocked> | |
%16 = "tt.expand_dims"(%11) <{axis = 0 : i32}> : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x32xi32, #blocked> | |
%17 = "tt.expand_dims"(%12) <{axis = 0 : i32}> : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x32xi32, #blocked> | |
%18 = "tt.expand_dims"(%13) <{axis = 0 : i32}> : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x32xi32, #blocked> | |
%19 = "arith.cmpi"(%15, %6) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked> | |
%20 = "arith.cmpi"(%17, %6) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked> | |
%21 = "arith.cmpi"(%18, %6) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked> | |
%22 = "arith.remsi"(%14, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%23 = "arith.remsi"(%15, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%24 = "arith.remsi"(%16, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%25 = "arith.remsi"(%17, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%26 = "arith.remsi"(%18, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%27 = "arith.divsi"(%14, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%28 = "arith.divsi"(%16, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%29 = "arith.muli"(%22, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%30 = "arith.muli"(%23, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%31 = "arith.muli"(%24, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%32 = "arith.muli"(%25, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%33 = "arith.muli"(%26, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%34 = "arith.divsi"(%29, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%35 = "arith.divsi"(%30, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%36 = "arith.divsi"(%31, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%37 = "arith.divsi"(%32, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%38 = "arith.divsi"(%33, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%39 = "arith.addi"(%35, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%40 = "arith.addi"(%37, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%41 = "arith.addi"(%38, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%42 = "arith.cmpi"(%35, %39) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked> | |
%43 = "arith.cmpi"(%38, %41) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked> | |
%44 = "arith.muli"(%27, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%45 = "arith.muli"(%28, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%46 = "arith.addi"(%44, %34) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%47 = "tt.splat"(%arg0) : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> | |
%48 = "tt.splat"(%arg0) : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> | |
%49 = "tt.addptr"(%47, %46) : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> | |
%50 = "arith.andi"(%19, %42) : (tensor<1x32xi1, #blocked>, tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked> | |
%51 = "tt.load"(%49, %50, %0) <{cache = 1 : i32, evict = 1 : i32, isVolatile = false, operandSegmentSizes = array<i32: 1, 1, 1>}> : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked> | |
%52 = "arith.select"(%43, %51, %0) : (tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked> | |
%53 = "arith.addi"(%37, %3) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%54 = "arith.addi"(%38, %3) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%55 = "arith.cmpi"(%53, %40) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked> | |
%56 = "arith.cmpi"(%54, %41) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked> | |
%57 = "arith.addi"(%45, %3) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%58 = "arith.addi"(%57, %36) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked> | |
%59 = "tt.addptr"(%48, %58) : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> | |
%60 = "arith.andi"(%20, %55) : (tensor<1x32xi1, #blocked>, tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked> | |
%61 = "tt.load"(%59, %60, %0) <{cache = 1 : i32, evict = 1 : i32, isVolatile = false, operandSegmentSizes = array<i32: 1, 1, 1>}> : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked> | |
%62 = "arith.select"(%56, %61, %0) : (tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked> | |
%63 = "arith.addf"(%62, %52) <{fastmath = #arith.fastmath<none>}> : (tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked> | |
%64 = "arith.select"(%43, %2, %0) : (tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked> | |
%65 = "arith.select"(%56, %2, %0) : (tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked> | |
%66 = "arith.addf"(%65, %64) <{fastmath = #arith.fastmath<none>}> : (tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked> | |
%67 = "arith.divf"(%63, %66) <{fastmath = #arith.fastmath<none>}> : (tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked> | |
%68 = "arith.select"(%21, %67, %1) : (tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked> | |
%69:2 = "tt.reduce"(%68, %18) <{axis = 1 : i32}> ({ | |
^bb0(%arg4: f64, %arg5: i32, %arg6: f64, %arg7: i32): | |
%74 = "arith.cmpf"(%arg4, %arg6) <{predicate = 2 : i64}> : (f64, f64) -> i1 | |
%75 = "arith.cmpf"(%arg4, %arg6) <{predicate = 1 : i64}> : (f64, f64) -> i1 | |
%76 = "arith.cmpf"(%arg4, %arg4) <{predicate = 13 : i64}> : (f64, f64) -> i1 | |
%77 = "arith.cmpf"(%arg6, %arg6) <{predicate = 13 : i64}> : (f64, f64) -> i1 | |
%78 = "arith.xori"(%77, %7) : (i1, i1) -> i1 | |
%79 = "arith.andi"(%76, %78) : (i1, i1) -> i1 | |
%80 = "arith.ori"(%74, %79) : (i1, i1) -> i1 | |
%81 = "arith.andi"(%76, %77) : (i1, i1) -> i1 | |
%82 = "arith.ori"(%75, %81) : (i1, i1) -> i1 | |
%83 = "arith.cmpi"(%arg5, %arg7) <{predicate = 2 : i64}> : (i32, i32) -> i1 | |
%84 = "arith.andi"(%82, %83) : (i1, i1) -> i1 | |
%85 = "arith.ori"(%80, %84) : (i1, i1) -> i1 | |
%86 = "arith.select"(%85, %arg4, %arg6) : (i1, f64, f64) -> f64 | |
%87 = "arith.select"(%85, %arg5, %arg7) : (i1, i32, i32) -> i32 | |
"tt.reduce.return"(%86, %87) : (f64, i32) -> () | |
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) | |
%70 = "tt.expand_dims"(%69#1) <{axis = 1 : i32}> : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xi32, #blocked> | |
%71 = "tt.addptr"(%arg1, %8) : (!tt.ptr<i64, 1>, i32) -> !tt.ptr<i64, 1> | |
%72 = "arith.extsi"(%70) : (tensor<1x1xi32, #blocked>) -> tensor<1x1xi64, #blocked> | |
%73 = "tt.splat"(%71) : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked> | |
"tt.store"(%73, %72) <{cache = 1 : i32, evict = 1 : i32}> : (tensor<1x1x!tt.ptr<i64, 1>, #blocked>, tensor<1x1xi64, #blocked>) -> () | |
"tt.return"() : () -> () | |
}) {noinline = false} : () -> () | |
}) {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} : () -> () | |
concurrent.futures.process._RemoteTraceback: | |
""" | |
Traceback (most recent call last): | |
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/process.py", line 246, in _process_worker | |
r = call_item.fn(*call_item.args, **call_item.kwargs) | |
File "/home/dberard/local/pytorch/torch/_inductor/codecache.py", line 2280, in _worker_compile | |
kernel.precompile(warm_cache_only_with_cc=cc) | |
File "/home/dberard/local/pytorch/torch/_inductor/triton_heuristics.py", line 188, in precompile | |
compiled_binary, launcher = self._precompile_config( | |
File "/home/dberard/local/pytorch/torch/_inductor/triton_heuristics.py", line 291, in _precompile_config | |
triton.compile( | |
File "/home/dberard/local/triton/python/triton/compiler/compiler.py", line 543, in compile | |
next_module = compile_kernel(module) | |
File "/home/dberard/local/triton/python/triton/compiler/compiler.py", line 437, in <lambda> | |
stages["ttgir"] = (lambda path: parse_mlir_module(path, context), lambda src: optimize_ttgir( | |
File "/home/dberard/local/triton/python/triton/compiler/compiler.py", line 151, in optimize_ttgir | |
pm.run(mod) | |
RuntimeError: PassManager::run failed | |
""" | |
The above exception was the direct cause of the following exception: | |
Traceback (most recent call last): | |
File "/home/dberard/local/scripts/bintriton.py", line 93, in <module> | |
async_compile.wait(globals()) | |
File "/home/dberard/local/pytorch/torch/_inductor/codecache.py", line 2465, in wait | |
scope[key] = result.result() | |
File "/home/dberard/local/pytorch/torch/_inductor/codecache.py", line 2308, in result | |
self.future.result() | |
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result | |
return self.__get_result() | |
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result | |
raise self._exception | |
RuntimeError: PassManager::run failed |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment