-
-
Save Jokeren/6da57bfcc8579931f0418917cafc4e12 to your computer and use it in GitHub Desktop.
bug.ptx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//
// Generated by LLVM NVPTX Back-End
//

.version 8.0
.target sm_80
.address_size 64

// .globl triton__0d1d2d3d

// -----------------------------------------------------------------------------
// triton__0d1d2d3d -- predicated f16 atomic-add scatter, 8 elements per thread.
//
// Parameters
//   param_0 (.u64)  base address of 32-bit words; word[e % 12] is loaded and
//                   multiplied by 12 as one term of the destination index.
//   param_1 (.u64)  base address of the 16-bit addends; each thread fetches its
//                   8 values (elements e0..e7) with one 16-byte vector load.
//   param_2 (.u64)  base address of the 16-bit array updated by the f16 atomics.
//   param_3 (.u32)  declared, but never read anywhere in this body.
//
// Launch bound: .maxntid 128, 1, 1.
//
// Indexing
//   e0 = (%ctaid.x << 10) | ((%tid.x << 3) & 1016)        (low 3 bits are 0)
//   ek = e0 | k,  k = 1..7                                 (same as e0 + k)
//   dst_index(e) = (e / 144) * 144 + ((e / 12) % 12) + 12 * word[e % 12]
//   where / and % are the signed forms produced by the multiply-high
//   sequences below (magic 715827883 = 0x2AAAAAAB, >>1  -> divide by 12;
//                    magic 954437177 = 0x38E38E39, >>5  -> divide by 144).
//
// NOTE(review): every atomic is predicated on ((%tid.x << 3) | k) < 1, using
// the *unmasked* %r14. With at most 128 threads per block %r14 is in
// [0, 1016], so only k = 0 in thread 0 can satisfy the test and %p11..%p17
// look permanently false. Confirm against the generating kernel whether this
// mask is what was intended.
// -----------------------------------------------------------------------------
.visible .entry triton__0d1d2d3d(
    .param .u64 triton__0d1d2d3d_param_0,
    .param .u64 triton__0d1d2d3d_param_1,
    .param .u64 triton__0d1d2d3d_param_2,
    .param .u32 triton__0d1d2d3d_param_3
)
.maxntid 128, 1, 1
{
    .reg .pred %p<18>;
    .reg .b16 %rs<17>;
    .reg .b32 %r<178>;
    .reg .b64 %rd<38>;

    // ---- Parameters and per-thread element indices e0..e7 (%r18, %r20..%r26)
    ld.param.u64 %rd18, [triton__0d1d2d3d_param_0];
    ld.param.u64 %rd19, [triton__0d1d2d3d_param_1];
    mov.u32 %r13, %tid.x;
    shl.b32 %r14, %r13, 3;              // %r14 = tid.x * 8 (also feeds the atomic predicates)
    ld.param.u64 %rd20, [triton__0d1d2d3d_param_2];
    and.b32 %r15, %r14, 1016;
    mov.u32 %r16, %ctaid.x;
    shl.b32 %r17, %r16, 10;             // 1024 elements per block
    or.b32 %r18, %r15, %r17;            // e0
    or.b32 %r20, %r18, 1;               // e1
    or.b32 %r21, %r18, 2;               // e2
    or.b32 %r22, %r18, 3;               // e3
    or.b32 %r23, %r18, 4;               // e4
    or.b32 %r24, %r18, 5;               // e5
    or.b32 %r25, %r18, 6;               // e6
    or.b32 %r26, %r18, 7;               // e7

    // ---- For each element: quotient e / 12 and remainder e % 12
    // e0: %r30 = e0 / 12, %r32 = e0 % 12
    mul.hi.s32 %r27, %r18, 715827883;
    shr.u32 %r28, %r27, 31;
    shr.s32 %r29, %r27, 1;
    add.s32 %r30, %r29, %r28;
    mul.lo.s32 %r31, %r30, 12;
    sub.s32 %r32, %r18, %r31;
    // e1: %r36 = e1 / 12, %r38 = e1 % 12
    mul.hi.s32 %r33, %r20, 715827883;
    shr.u32 %r34, %r33, 31;
    shr.s32 %r35, %r33, 1;
    add.s32 %r36, %r35, %r34;
    mul.lo.s32 %r37, %r36, 12;
    sub.s32 %r38, %r20, %r37;
    // e2: %r42 = e2 / 12, %r44 = e2 % 12
    mul.hi.s32 %r39, %r21, 715827883;
    shr.u32 %r40, %r39, 31;
    shr.s32 %r41, %r39, 1;
    add.s32 %r42, %r41, %r40;
    mul.lo.s32 %r43, %r42, 12;
    sub.s32 %r44, %r21, %r43;
    // e3: %r48 = e3 / 12, %r50 = e3 % 12
    mul.hi.s32 %r45, %r22, 715827883;
    shr.u32 %r46, %r45, 31;
    shr.s32 %r47, %r45, 1;
    add.s32 %r48, %r47, %r46;
    mul.lo.s32 %r49, %r48, 12;
    sub.s32 %r50, %r22, %r49;
    // e4: %r54 = e4 / 12, %r56 = e4 % 12
    mul.hi.s32 %r51, %r23, 715827883;
    shr.u32 %r52, %r51, 31;
    shr.s32 %r53, %r51, 1;
    add.s32 %r54, %r53, %r52;
    mul.lo.s32 %r55, %r54, 12;
    sub.s32 %r56, %r23, %r55;
    // e5: %r60 = e5 / 12, %r62 = e5 % 12
    mul.hi.s32 %r57, %r24, 715827883;
    shr.u32 %r58, %r57, 31;
    shr.s32 %r59, %r57, 1;
    add.s32 %r60, %r59, %r58;
    mul.lo.s32 %r61, %r60, 12;
    sub.s32 %r62, %r24, %r61;
    // e6: %r66 = e6 / 12, %r68 = e6 % 12
    mul.hi.s32 %r63, %r25, 715827883;
    shr.u32 %r64, %r63, 31;
    shr.s32 %r65, %r63, 1;
    add.s32 %r66, %r65, %r64;
    mul.lo.s32 %r67, %r66, 12;
    sub.s32 %r68, %r25, %r67;
    // e7: %r72 = e7 / 12, %r74 = e7 % 12
    mul.hi.s32 %r69, %r26, 715827883;
    shr.u32 %r70, %r69, 31;
    shr.s32 %r71, %r69, 1;
    add.s32 %r72, %r71, %r70;
    mul.lo.s32 %r73, %r72, 12;
    sub.s32 %r74, %r26, %r73;

    // ---- Addresses param_0 + 4 * (e % 12), then the eight 32-bit loads
    mul.wide.s32 %rd21, %r32, 4;
    add.s64 %rd1, %rd18, %rd21;
    mul.wide.s32 %rd22, %r38, 4;
    add.s64 %rd2, %rd18, %rd22;
    mul.wide.s32 %rd23, %r44, 4;
    add.s64 %rd3, %rd18, %rd23;
    mul.wide.s32 %rd24, %r50, 4;
    add.s64 %rd4, %rd18, %rd24;
    mul.wide.s32 %rd25, %r56, 4;
    add.s64 %rd5, %rd18, %rd25;
    mul.wide.s32 %rd26, %r62, 4;
    add.s64 %rd6, %rd18, %rd26;
    mul.wide.s32 %rd27, %r68, 4;
    add.s64 %rd7, %rd18, %rd27;
    mul.wide.s32 %rd28, %r74, 4;
    add.s64 %rd8, %rd18, %rd28;
    mov.pred %p1, -1;                   // loads are unconditionally enabled
    @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];
    @%p1 ld.global.b32 { %r2 }, [ %rd2 + 0 ];
    @%p1 ld.global.b32 { %r3 }, [ %rd3 + 0 ];
    @%p1 ld.global.b32 { %r4 }, [ %rd4 + 0 ];
    @%p1 ld.global.b32 { %r5 }, [ %rd5 + 0 ];
    @%p1 ld.global.b32 { %r6 }, [ %rd6 + 0 ];
    @%p1 ld.global.b32 { %r7 }, [ %rd7 + 0 ];
    @%p1 ld.global.b32 { %r8 }, [ %rd8 + 0 ];

    // ---- One 16-byte load of the eight 16-bit addends at param_1 + 2 * e0,
    //      then split each 32-bit word into its low and high halves.
    mul.wide.s32 %rd29, %r18, 2;
    add.s64 %rd9, %rd19, %rd29;
    @%p1 ld.global.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd9 + 0 ];
    cvt.u16.u32 %rs2, %r9;                              // addend for e0 (low half)
    { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r9; }        // addend for e1 (high half)
    cvt.u16.u32 %rs6, %r10;                             // addend for e2
    { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r10; }       // addend for e3
    cvt.u16.u32 %rs10, %r11;                            // addend for e4
    { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r11; }      // addend for e5
    cvt.u16.u32 %rs14, %r12;                            // addend for e6
    { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r12; }      // addend for e7

    // ---- For each element: (e / 12) % 12 and e / 144
    // e0: %r80 = (e0 / 12) % 12, %r84 = e0 / 144
    mul.hi.s32 %r75, %r30, 715827883;
    shr.u32 %r76, %r75, 31;
    shr.s32 %r77, %r75, 1;
    add.s32 %r78, %r77, %r76;
    mul.lo.s32 %r79, %r78, 12;
    sub.s32 %r80, %r30, %r79;
    mul.hi.s32 %r81, %r18, 954437177;
    shr.u32 %r82, %r81, 31;
    shr.s32 %r83, %r81, 5;
    add.s32 %r84, %r83, %r82;
    // e1: %r90 = (e1 / 12) % 12, %r94 = e1 / 144
    mul.hi.s32 %r85, %r36, 715827883;
    shr.u32 %r86, %r85, 31;
    shr.s32 %r87, %r85, 1;
    add.s32 %r88, %r87, %r86;
    mul.lo.s32 %r89, %r88, 12;
    sub.s32 %r90, %r36, %r89;
    mul.hi.s32 %r91, %r20, 954437177;
    shr.u32 %r92, %r91, 31;
    shr.s32 %r93, %r91, 5;
    add.s32 %r94, %r93, %r92;
    // e2: %r100 = (e2 / 12) % 12, %r104 = e2 / 144
    mul.hi.s32 %r95, %r42, 715827883;
    shr.u32 %r96, %r95, 31;
    shr.s32 %r97, %r95, 1;
    add.s32 %r98, %r97, %r96;
    mul.lo.s32 %r99, %r98, 12;
    sub.s32 %r100, %r42, %r99;
    mul.hi.s32 %r101, %r21, 954437177;
    shr.u32 %r102, %r101, 31;
    shr.s32 %r103, %r101, 5;
    add.s32 %r104, %r103, %r102;
    // e3: %r110 = (e3 / 12) % 12, %r114 = e3 / 144
    mul.hi.s32 %r105, %r48, 715827883;
    shr.u32 %r106, %r105, 31;
    shr.s32 %r107, %r105, 1;
    add.s32 %r108, %r107, %r106;
    mul.lo.s32 %r109, %r108, 12;
    sub.s32 %r110, %r48, %r109;
    mul.hi.s32 %r111, %r22, 954437177;
    shr.u32 %r112, %r111, 31;
    shr.s32 %r113, %r111, 5;
    add.s32 %r114, %r113, %r112;
    // e4: %r120 = (e4 / 12) % 12, %r124 = e4 / 144
    mul.hi.s32 %r115, %r54, 715827883;
    shr.u32 %r116, %r115, 31;
    shr.s32 %r117, %r115, 1;
    add.s32 %r118, %r117, %r116;
    mul.lo.s32 %r119, %r118, 12;
    sub.s32 %r120, %r54, %r119;
    mul.hi.s32 %r121, %r23, 954437177;
    shr.u32 %r122, %r121, 31;
    shr.s32 %r123, %r121, 5;
    add.s32 %r124, %r123, %r122;
    // e5: %r130 = (e5 / 12) % 12, %r134 = e5 / 144
    mul.hi.s32 %r125, %r60, 715827883;
    shr.u32 %r126, %r125, 31;
    shr.s32 %r127, %r125, 1;
    add.s32 %r128, %r127, %r126;
    mul.lo.s32 %r129, %r128, 12;
    sub.s32 %r130, %r60, %r129;
    mul.hi.s32 %r131, %r24, 954437177;
    shr.u32 %r132, %r131, 31;
    shr.s32 %r133, %r131, 5;
    add.s32 %r134, %r133, %r132;
    // e6: %r140 = (e6 / 12) % 12, %r144 = e6 / 144
    mul.hi.s32 %r135, %r66, 715827883;
    shr.u32 %r136, %r135, 31;
    shr.s32 %r137, %r135, 1;
    add.s32 %r138, %r137, %r136;
    mul.lo.s32 %r139, %r138, 12;
    sub.s32 %r140, %r66, %r139;
    mul.hi.s32 %r141, %r25, 954437177;
    shr.u32 %r142, %r141, 31;
    shr.s32 %r143, %r141, 5;
    add.s32 %r144, %r143, %r142;
    // e7: %r150 = (e7 / 12) % 12, %r154 = e7 / 144
    mul.hi.s32 %r145, %r72, 715827883;
    shr.u32 %r146, %r145, 31;
    shr.s32 %r147, %r145, 1;
    add.s32 %r148, %r147, %r146;
    mul.lo.s32 %r149, %r148, 12;
    sub.s32 %r150, %r72, %r149;
    mul.hi.s32 %r151, %r26, 954437177;
    shr.u32 %r152, %r151, 31;
    shr.s32 %r153, %r151, 5;
    add.s32 %r154, %r153, %r152;

    // ---- Destination index: (e / 144) * 144 + (e / 12) % 12 + 12 * loaded word
    mad.lo.s32 %r155, %r84, 144, %r80;
    mad.lo.s32 %r156, %r1, 12, %r155;
    mad.lo.s32 %r157, %r94, 144, %r90;
    mad.lo.s32 %r158, %r2, 12, %r157;
    mad.lo.s32 %r159, %r104, 144, %r100;
    mad.lo.s32 %r160, %r3, 12, %r159;
    mad.lo.s32 %r161, %r114, 144, %r110;
    mad.lo.s32 %r162, %r4, 12, %r161;
    mad.lo.s32 %r163, %r124, 144, %r120;
    mad.lo.s32 %r164, %r5, 12, %r163;
    mad.lo.s32 %r165, %r134, 144, %r130;
    mad.lo.s32 %r166, %r6, 12, %r165;
    mad.lo.s32 %r167, %r144, 144, %r140;
    mad.lo.s32 %r168, %r7, 12, %r167;
    mad.lo.s32 %r169, %r154, 144, %r150;
    mad.lo.s32 %r170, %r8, 12, %r169;

    // ---- Destination addresses param_2 + 2 * index (2-byte elements)
    mul.wide.s32 %rd30, %r156, 2;
    add.s64 %rd10, %rd20, %rd30;
    mul.wide.s32 %rd31, %r158, 2;
    add.s64 %rd11, %rd20, %rd31;
    mul.wide.s32 %rd32, %r160, 2;
    add.s64 %rd12, %rd20, %rd32;
    mul.wide.s32 %rd33, %r162, 2;
    add.s64 %rd13, %rd20, %rd33;
    mul.wide.s32 %rd34, %r164, 2;
    add.s64 %rd14, %rd20, %rd34;
    mul.wide.s32 %rd35, %r166, 2;
    add.s64 %rd15, %rd20, %rd35;
    mul.wide.s32 %rd36, %r168, 2;
    add.s64 %rd16, %rd20, %rd36;
    mul.wide.s32 %rd37, %r170, 2;

    // ---- Eight predicated f16 atomic adds, one brace-delimited scope each.
    //      Predicate k is ((tid.x << 3) | k) < 1; the value returned by each
    //      atomic (%rs1, %rs3, ... %rs15) is not used afterwards.
    {add.s64 %rd17, %rd20, %rd37;
    setp.lt.s32 %p10, %r14, 1;
    @%p10 atom.global.gpu.add.noftz.f16 %rs1, [ %rd10 + 0 ], %rs2;}
    {or.b32 %r171, %r14, 1;
    setp.lt.s32 %p11, %r171, 1;
    @%p11 atom.global.gpu.add.noftz.f16 %rs3, [ %rd11 + 0 ], %rs4;}
    {or.b32 %r172, %r14, 2;
    setp.lt.s32 %p12, %r172, 1;
    @%p12 atom.global.gpu.add.noftz.f16 %rs5, [ %rd12 + 0 ], %rs6;}
    {or.b32 %r173, %r14, 3;
    setp.lt.s32 %p13, %r173, 1;
    @%p13 atom.global.gpu.add.noftz.f16 %rs7, [ %rd13 + 0 ], %rs8;}
    {or.b32 %r174, %r14, 4;
    setp.lt.s32 %p14, %r174, 1;
    @%p14 atom.global.gpu.add.noftz.f16 %rs9, [ %rd14 + 0 ], %rs10;}
    {or.b32 %r175, %r14, 5;
    setp.lt.s32 %p15, %r175, 1;
    @%p15 atom.global.gpu.add.noftz.f16 %rs11, [ %rd15 + 0 ], %rs12;}
    {or.b32 %r176, %r14, 6;
    setp.lt.s32 %p16, %r176, 1;
    @%p16 atom.global.gpu.add.noftz.f16 %rs13, [ %rd16 + 0 ], %rs14;}
    {or.b32 %r177, %r14, 7;
    setp.lt.s32 %p17, %r177, 1;
    @%p17 atom.global.gpu.add.noftz.f16 %rs15, [ %rd17 + 0 ], %rs16;}
    ret;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment