Skip to content

Instantly share code, notes, and snippets.

@Jokeren
Created August 17, 2024 16:22
Show Gist options
  • Save Jokeren/8e5f8fe123fe7c8a40bbe9ae99bc165b to your computer and use it in GitHub Desktop.
Save Jokeren/8e5f8fe123fe7c8a40bbe9ae99bc165b to your computer and use it in GitHub Desktop.
AMD vec problem
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Code in this module is generated for the AMD GPU (amdgcn) HSA target.
target triple = "amdgcn-amd-amdhsa"
; Externally defined, zero-length i8 array in addrspace(3), 16-byte aligned.
; The kernel in this module indexes it both as i32 elements (data staging) and
; as i8 elements (one-byte flags), so its real size is decided outside this file.
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
; Function Attrs: mustprogress nofree norecurse nounwind willreturn
;
; flip_kernel(%0, %1)
;   %0 : addrspace(1) pointer, read-only  -- input buffer of i32 elements
;   %1 : addrspace(1) pointer, write-only -- output buffer of i32 elements
;
; Straight-line kernel (no branches). Per work-item it:
;   1. derives element indices from the work-item id and loads two <4 x i32>
;      vectors from %0;
;   2. stages each vector through @global_smem (addrspace(3)) and reads back
;      eight i32 values per vector, giving 16 values per work-item;
;   3. runs eight "mask then add" stages. In every stage a one-byte flag
;      (bit0(tid) != bit1(tid)) is written to @global_smem, flags are read
;      back from a stage-specific byte offset, each value is replaced by 0
;      when its flag is clear, and values are summed pairwise -- with another
;      lane's value via llvm.amdgcn.ds.swizzle in the first five stages, with
;      another local value in the last three;
;   4. stores the remaining eight i32 sums to %1 at the same element indices
;      that were used to address %0.
;
; Every shared-memory write/read pair is separated by
; "fence release / s.barrier / fence acquire" at workgroup scope.
;
; NOTE(review): the gist is titled "AMD vec problem"; the IR itself does not
; say which instruction is at fault. Candidates are flagged below with
; NOTE(review) -- confirm against the original bug report.
define amdgpu_kernel void @flip_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 !dbg !4 {
; ---- Index computation ------------------------------------------------------
; %3 = work-item id in x ("tid" in the comments below).
%3 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7
; %4 = tid * 4: each work-item owns groups of four consecutive i32 elements.
%4 = shl i32 %3, 2, !dbg !8
; %5 = bits 8..10 of (tid*4) (mask 0x700), i.e. bits 6..8 of tid.
%5 = and i32 %4, 1792, !dbg !8
; %6 = %5 + 2048: same offset shifted into the second 2048-element half.
%6 = or disjoint i32 %5, 2048, !dbg !8
; %7 = bit 0 of tid (used later for the flag and for a flag byte offset).
%7 = and i32 %3, 1, !dbg !9
; %8 = (tid*4) & 0xFC; %9..%11 are the next three consecutive element offsets.
%8 = and i32 %4, 252, !dbg !9
%9 = or disjoint i32 %8, 1, !dbg !9
%10 = or disjoint i32 %8, 2, !dbg !9
%11 = or disjoint i32 %8, 3, !dbg !9
; %12..%15: element indices of the first group (base = (tid*4) & 0x7FC).
%12 = and i32 %4, 2044, !dbg !10
%13 = or disjoint i32 %9, %5, !dbg !10
%14 = or disjoint i32 %10, %5, !dbg !10
%15 = or disjoint i32 %11, %5, !dbg !10
; %16..%19: element indices of the second group (first group + 2048).
%16 = or disjoint i32 %6, %8, !dbg !10
%17 = or disjoint i32 %9, %6, !dbg !10
%18 = or disjoint i32 %10, %6, !dbg !10
%19 = or disjoint i32 %11, %6, !dbg !10
; ---- Global loads -----------------------------------------------------------
; Addresses of the first element of each group inside the input buffer %0.
%20 = zext nneg i32 %12 to i64, !dbg !11
%21 = getelementptr i32, ptr addrspace(1) %0, i64 %20, !dbg !11
%22 = zext nneg i32 %16 to i64, !dbg !11
%23 = getelementptr i32, ptr addrspace(1) %0, i64 %22, !dbg !11
; Both groups are loaded as one 16-byte-aligned <4 x i32> each. The pointers
; are first cast from addrspace(1) to the generic address space, so these are
; generic-pointer loads rather than addrspace(1) loads.
%24 = addrspacecast ptr addrspace(1) %21 to ptr, !dbg !12
%25 = load <4 x i32>, ptr %24, align 16, !dbg !12
%26 = addrspacecast ptr addrspace(1) %23 to ptr, !dbg !12
%27 = load <4 x i32>, ptr %26, align 16, !dbg !12
; ---- Staging round 1: first vector through @global_smem ---------------------
; %28 = tid * 8. %30 (used for the read side below) combines (tid*8) & 0xF8
; with %5; %31 = (tid*8) & 0xFF8 is this work-item's write index (in i32 units).
%28 = shl i32 %3, 3, !dbg !13
%29 = and i32 %28, 248, !dbg !13
%30 = or disjoint i32 %29, %5, !dbg !13
%31 = and i32 %28, 4088, !dbg !13
%32 = zext nneg i32 %31 to i64, !dbg !13
%33 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %32, !dbg !13
; Elements 0,1 of the loaded vector go to i32 index %31 ...
%34 = shufflevector <4 x i32> %25, <4 x i32> poison, <2 x i32> <i32 0, i32 1>, !dbg !13
store <2 x i32> %34, ptr addrspace(3) %33, align 16, !dbg !13
; ... and elements 2,3 go to i32 index %31 + 4 (indices %31+2 and %31+3 are
; not written by this work-item).
; NOTE(review): the <4 x i32> value is written as two separate <2 x i32>
; halves four elements apart -- possibly related to the "vec problem" in the
; gist title; confirm.
%35 = or disjoint i32 %31, 4, !dbg !13
%36 = zext nneg i32 %35 to i64, !dbg !13
%37 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %36, !dbg !13
%38 = shufflevector <4 x i32> %25, <4 x i32> poison, <2 x i32> <i32 2, i32 3>, !dbg !13
store <2 x i32> %38, ptr addrspace(3) %37, align 16, !dbg !13
; Make the stores visible to the workgroup before anyone reads.
fence syncscope("workgroup") release, !dbg !13
tail call void @llvm.amdgcn.s.barrier(), !dbg !13
fence syncscope("workgroup") acquire, !dbg !13
; Read side: base index %39 = 2 * %30. Eight scalar i32 loads at
; %39 + {0, 4, 8, 12} (align 16) and %39 + {1, 5, 9, 13} (align 4).
%39 = shl nuw nsw i32 %30, 1, !dbg !13
%40 = zext nneg i32 %39 to i64, !dbg !13
%41 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %40, !dbg !13
%42 = load i32, ptr addrspace(3) %41, align 16, !dbg !13
%43 = or disjoint i32 %39, 4, !dbg !13
%44 = zext nneg i32 %43 to i64, !dbg !13
%45 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %44, !dbg !13
%46 = load i32, ptr addrspace(3) %45, align 16, !dbg !13
%47 = or disjoint i32 %39, 8, !dbg !13
%48 = zext nneg i32 %47 to i64, !dbg !13
%49 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %48, !dbg !13
%50 = load i32, ptr addrspace(3) %49, align 16, !dbg !13
%51 = or disjoint i32 %39, 12, !dbg !13
%52 = zext nneg i32 %51 to i64, !dbg !13
%53 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %52, !dbg !13
%54 = load i32, ptr addrspace(3) %53, align 16, !dbg !13
%55 = or disjoint i32 %39, 1, !dbg !13
%56 = zext nneg i32 %55 to i64, !dbg !13
%57 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %56, !dbg !13
%58 = load i32, ptr addrspace(3) %57, align 4, !dbg !13
%59 = or disjoint i32 %39, 5, !dbg !13
%60 = zext nneg i32 %59 to i64, !dbg !13
%61 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %60, !dbg !13
%62 = load i32, ptr addrspace(3) %61, align 4, !dbg !13
%63 = or disjoint i32 %39, 9, !dbg !13
%64 = zext nneg i32 %63 to i64, !dbg !13
%65 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %64, !dbg !13
%66 = load i32, ptr addrspace(3) %65, align 4, !dbg !13
%67 = or disjoint i32 %39, 13, !dbg !13
%68 = zext nneg i32 %67 to i64, !dbg !13
%69 = getelementptr inbounds i32, ptr addrspace(3) @global_smem, i64 %68, !dbg !13
%70 = load i32, ptr addrspace(3) %69, align 4, !dbg !13
; All reads of round 1 must finish before the buffer is overwritten.
fence syncscope("workgroup") release, !dbg !13
tail call void @llvm.amdgcn.s.barrier(), !dbg !13
fence syncscope("workgroup") acquire, !dbg !13
; ---- Staging round 2: second vector, same addresses as round 1 --------------
%71 = shufflevector <4 x i32> %27, <4 x i32> poison, <2 x i32> <i32 0, i32 1>, !dbg !13
store <2 x i32> %71, ptr addrspace(3) %33, align 16, !dbg !13
%72 = shufflevector <4 x i32> %27, <4 x i32> poison, <2 x i32> <i32 2, i32 3>, !dbg !13
store <2 x i32> %72, ptr addrspace(3) %37, align 16, !dbg !13
fence syncscope("workgroup") release, !dbg !13
tail call void @llvm.amdgcn.s.barrier(), !dbg !13
fence syncscope("workgroup") acquire, !dbg !13
; Same eight read addresses as round 1, now yielding data from %27.
%73 = load i32, ptr addrspace(3) %41, align 16, !dbg !13
%74 = load i32, ptr addrspace(3) %45, align 16, !dbg !13
%75 = load i32, ptr addrspace(3) %49, align 16, !dbg !13
%76 = load i32, ptr addrspace(3) %53, align 16, !dbg !13
%77 = load i32, ptr addrspace(3) %57, align 4, !dbg !13
%78 = load i32, ptr addrspace(3) %61, align 4, !dbg !13
%79 = load i32, ptr addrspace(3) %65, align 4, !dbg !13
%80 = load i32, ptr addrspace(3) %69, align 4, !dbg !13
; ---- Flag computation -------------------------------------------------------
; %81 = tid >> 1, %.lobit = bit 1 of tid, %82 = (bit 0 of tid != bit 1 of tid).
%81 = lshr i32 %3, 1, !dbg !17
%.lobit = and i32 %81, 1, !dbg !17
%82 = icmp ne i32 %7, %.lobit, !dbg !18
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
; ---- Stage 1: flag exchange + swizzle(16415) --------------------------------
; The flag is widened to one byte (%83) and stored at byte offset
; %85 = (tid & 3) * 2 of @global_smem. %87/%88 are reused by stages 2..6.
%83 = zext i1 %82 to i8, !dbg !19
%84 = and i32 %3, 3, !dbg !19
%85 = shl nuw nsw i32 %84, 1, !dbg !19
%86 = zext nneg i32 %85 to i64, !dbg !19
%87 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %86, !dbg !19
%88 = insertelement <1 x i8> poison, i8 %83, i64 0, !dbg !19
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
; Read the flag byte at offset (tid >> 3) & 6 and take its element 0.
; NOTE(review): the byte was stored as <1 x i8> but is loaded as <8 x i1>;
; element 0 is presumably meant to be the low bit of that byte. This
; store/load type mismatch recurs in every stage below -- verify how the
; AMDGPU backend lowers it.
%89 = lshr i32 %3, 3, !dbg !19
%90 = and i32 %89, 6, !dbg !19
%91 = zext nneg i32 %90 to i64, !dbg !19
%92 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %91, !dbg !19
%93 = load <8 x i1>, ptr addrspace(3) %92, align 2, !dbg !19
%94 = extractelement <8 x i1> %93, i64 0, !dbg !19
; Keep each of the 16 staged values when the flag is set, otherwise use 0.
%95 = select i1 %94, i32 %42, i32 0, !dbg !19
%96 = select i1 %94, i32 %46, i32 0, !dbg !19
%97 = select i1 %94, i32 %50, i32 0, !dbg !19
%98 = select i1 %94, i32 %54, i32 0, !dbg !19
%99 = select i1 %94, i32 %58, i32 0, !dbg !19
%100 = select i1 %94, i32 %62, i32 0, !dbg !19
%101 = select i1 %94, i32 %66, i32 0, !dbg !19
%102 = select i1 %94, i32 %70, i32 0, !dbg !19
%103 = select i1 %94, i32 %73, i32 0, !dbg !19
%104 = select i1 %94, i32 %74, i32 0, !dbg !19
%105 = select i1 %94, i32 %75, i32 0, !dbg !19
%106 = select i1 %94, i32 %76, i32 0, !dbg !19
%107 = select i1 %94, i32 %77, i32 0, !dbg !19
%108 = select i1 %94, i32 %78, i32 0, !dbg !19
%109 = select i1 %94, i32 %79, i32 0, !dbg !19
%110 = select i1 %94, i32 %80, i32 0, !dbg !19
; Add each masked value to the value returned by ds.swizzle for it.
; Pattern 16415 = 0x401F. NOTE(review): under the DS_SWIZZLE bit-mask encoding
; (and = bits 4:0, or = bits 9:5, xor = bits 14:10) this would be
; and=0x1F, or=0, xor=16, i.e. exchange with lane (id ^ 16) -- confirm against
; the ISA documentation for the targeted GPU.
%111 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %95, i32 16415), !dbg !20
%112 = add i32 %95, %111, !dbg !22
%113 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %99, i32 16415), !dbg !20
%114 = add i32 %99, %113, !dbg !22
%115 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %96, i32 16415), !dbg !20
%116 = add i32 %115, %96, !dbg !22
%117 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %100, i32 16415), !dbg !20
%118 = add i32 %117, %100, !dbg !22
%119 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %97, i32 16415), !dbg !20
%120 = add i32 %119, %97, !dbg !22
%121 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %101, i32 16415), !dbg !20
%122 = add i32 %121, %101, !dbg !22
%123 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %98, i32 16415), !dbg !20
%124 = add i32 %123, %98, !dbg !22
%125 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %102, i32 16415), !dbg !20
%126 = add i32 %125, %102, !dbg !22
%127 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %103, i32 16415), !dbg !20
%128 = add i32 %127, %103, !dbg !22
%129 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %107, i32 16415), !dbg !20
%130 = add i32 %129, %107, !dbg !22
%131 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %104, i32 16415), !dbg !20
%132 = add i32 %131, %104, !dbg !22
%133 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %108, i32 16415), !dbg !20
%134 = add i32 %133, %108, !dbg !22
%135 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %105, i32 16415), !dbg !20
%136 = add i32 %135, %105, !dbg !22
%137 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %109, i32 16415), !dbg !20
%138 = add i32 %137, %109, !dbg !22
%139 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %106, i32 16415), !dbg !20
%140 = add i32 %139, %106, !dbg !22
%141 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %110, i32 16415), !dbg !20
%142 = add i32 %141, %110, !dbg !22
; ---- Stage 2: flag exchange + swizzle(8223) ---------------------------------
; The same flag byte is written again to the same address, then read back
; from byte offset (tid >> 2) & 6.
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
%143 = lshr i32 %3, 2, !dbg !19
%144 = and i32 %143, 6, !dbg !19
%145 = zext nneg i32 %144 to i64, !dbg !19
%146 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %145, !dbg !19
%147 = load <8 x i1>, ptr addrspace(3) %146, align 2, !dbg !19
%148 = extractelement <8 x i1> %147, i64 0, !dbg !19
; Mask the 16 stage-1 sums with the new flag.
%149 = select i1 %148, i32 %112, i32 0, !dbg !19
%150 = select i1 %148, i32 %116, i32 0, !dbg !19
%151 = select i1 %148, i32 %120, i32 0, !dbg !19
%152 = select i1 %148, i32 %124, i32 0, !dbg !19
%153 = select i1 %148, i32 %114, i32 0, !dbg !19
%154 = select i1 %148, i32 %118, i32 0, !dbg !19
%155 = select i1 %148, i32 %122, i32 0, !dbg !19
%156 = select i1 %148, i32 %126, i32 0, !dbg !19
%157 = select i1 %148, i32 %128, i32 0, !dbg !19
%158 = select i1 %148, i32 %132, i32 0, !dbg !19
%159 = select i1 %148, i32 %136, i32 0, !dbg !19
%160 = select i1 %148, i32 %140, i32 0, !dbg !19
%161 = select i1 %148, i32 %130, i32 0, !dbg !19
%162 = select i1 %148, i32 %134, i32 0, !dbg !19
%163 = select i1 %148, i32 %138, i32 0, !dbg !19
%164 = select i1 %148, i32 %142, i32 0, !dbg !19
; Pattern 8223 = 0x201F (NOTE(review): xor field would be 8 -- confirm).
%165 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %149, i32 8223), !dbg !20
%166 = add i32 %149, %165, !dbg !22
%167 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %153, i32 8223), !dbg !20
%168 = add i32 %153, %167, !dbg !22
%169 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %150, i32 8223), !dbg !20
%170 = add i32 %169, %150, !dbg !22
%171 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %154, i32 8223), !dbg !20
%172 = add i32 %171, %154, !dbg !22
%173 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %151, i32 8223), !dbg !20
%174 = add i32 %173, %151, !dbg !22
%175 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %155, i32 8223), !dbg !20
%176 = add i32 %175, %155, !dbg !22
%177 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %152, i32 8223), !dbg !20
%178 = add i32 %177, %152, !dbg !22
%179 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %156, i32 8223), !dbg !20
%180 = add i32 %179, %156, !dbg !22
%181 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %157, i32 8223), !dbg !20
%182 = add i32 %181, %157, !dbg !22
%183 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %161, i32 8223), !dbg !20
%184 = add i32 %183, %161, !dbg !22
%185 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %158, i32 8223), !dbg !20
%186 = add i32 %185, %158, !dbg !22
%187 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %162, i32 8223), !dbg !20
%188 = add i32 %187, %162, !dbg !22
%189 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %159, i32 8223), !dbg !20
%190 = add i32 %189, %159, !dbg !22
%191 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %163, i32 8223), !dbg !20
%192 = add i32 %191, %163, !dbg !22
%193 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %160, i32 8223), !dbg !20
%194 = add i32 %193, %160, !dbg !22
%195 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %164, i32 8223), !dbg !20
%196 = add i32 %195, %164, !dbg !22
; ---- Stage 3: flag exchange + swizzle(4127) ---------------------------------
; Flag is re-stored, then read from byte offset (tid >> 1) & 6.
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
%197 = and i32 %81, 6, !dbg !19
%198 = zext nneg i32 %197 to i64, !dbg !19
%199 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %198, !dbg !19
%200 = load <8 x i1>, ptr addrspace(3) %199, align 2, !dbg !19
%201 = extractelement <8 x i1> %200, i64 0, !dbg !19
; Mask the 16 stage-2 sums with the new flag.
%202 = select i1 %201, i32 %166, i32 0, !dbg !19
%203 = select i1 %201, i32 %170, i32 0, !dbg !19
%204 = select i1 %201, i32 %174, i32 0, !dbg !19
%205 = select i1 %201, i32 %178, i32 0, !dbg !19
%206 = select i1 %201, i32 %168, i32 0, !dbg !19
%207 = select i1 %201, i32 %172, i32 0, !dbg !19
%208 = select i1 %201, i32 %176, i32 0, !dbg !19
%209 = select i1 %201, i32 %180, i32 0, !dbg !19
%210 = select i1 %201, i32 %182, i32 0, !dbg !19
%211 = select i1 %201, i32 %186, i32 0, !dbg !19
%212 = select i1 %201, i32 %190, i32 0, !dbg !19
%213 = select i1 %201, i32 %194, i32 0, !dbg !19
%214 = select i1 %201, i32 %184, i32 0, !dbg !19
%215 = select i1 %201, i32 %188, i32 0, !dbg !19
%216 = select i1 %201, i32 %192, i32 0, !dbg !19
%217 = select i1 %201, i32 %196, i32 0, !dbg !19
; Pattern 4127 = 0x101F (NOTE(review): xor field would be 4 -- confirm).
%218 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %202, i32 4127), !dbg !20
%219 = add i32 %202, %218, !dbg !22
%220 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %206, i32 4127), !dbg !20
%221 = add i32 %206, %220, !dbg !22
%222 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %203, i32 4127), !dbg !20
%223 = add i32 %222, %203, !dbg !22
%224 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %207, i32 4127), !dbg !20
%225 = add i32 %224, %207, !dbg !22
%226 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %204, i32 4127), !dbg !20
%227 = add i32 %226, %204, !dbg !22
%228 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %208, i32 4127), !dbg !20
%229 = add i32 %228, %208, !dbg !22
%230 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %205, i32 4127), !dbg !20
%231 = add i32 %230, %205, !dbg !22
%232 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %209, i32 4127), !dbg !20
%233 = add i32 %232, %209, !dbg !22
%234 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %210, i32 4127), !dbg !20
%235 = add i32 %234, %210, !dbg !22
%236 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %214, i32 4127), !dbg !20
%237 = add i32 %236, %214, !dbg !22
%238 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %211, i32 4127), !dbg !20
%239 = add i32 %238, %211, !dbg !22
%240 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %215, i32 4127), !dbg !20
%241 = add i32 %240, %215, !dbg !22
%242 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %212, i32 4127), !dbg !20
%243 = add i32 %242, %212, !dbg !22
%244 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %216, i32 4127), !dbg !20
%245 = add i32 %244, %216, !dbg !22
%246 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %213, i32 4127), !dbg !20
%247 = add i32 %246, %213, !dbg !22
%248 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %217, i32 4127), !dbg !20
%249 = add i32 %248, %217, !dbg !22
; ---- Stage 4: flag exchange + swizzle(2079) ---------------------------------
; Flag is re-stored, then read from byte offset (bit1(tid) << 1) | (tid & 4).
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
%250 = shl nuw nsw i32 %.lobit, 1, !dbg !19
%.mask = and i32 %3, 4, !dbg !19
%251 = or disjoint i32 %250, %.mask, !dbg !19
%252 = zext nneg i32 %251 to i64, !dbg !19
%253 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %252, !dbg !19
%254 = load <8 x i1>, ptr addrspace(3) %253, align 2, !dbg !19
%255 = extractelement <8 x i1> %254, i64 0, !dbg !19
; Mask the 16 stage-3 sums with the new flag.
%256 = select i1 %255, i32 %219, i32 0, !dbg !19
%257 = select i1 %255, i32 %223, i32 0, !dbg !19
%258 = select i1 %255, i32 %227, i32 0, !dbg !19
%259 = select i1 %255, i32 %231, i32 0, !dbg !19
%260 = select i1 %255, i32 %221, i32 0, !dbg !19
%261 = select i1 %255, i32 %225, i32 0, !dbg !19
%262 = select i1 %255, i32 %229, i32 0, !dbg !19
%263 = select i1 %255, i32 %233, i32 0, !dbg !19
%264 = select i1 %255, i32 %235, i32 0, !dbg !19
%265 = select i1 %255, i32 %239, i32 0, !dbg !19
%266 = select i1 %255, i32 %243, i32 0, !dbg !19
%267 = select i1 %255, i32 %247, i32 0, !dbg !19
%268 = select i1 %255, i32 %237, i32 0, !dbg !19
%269 = select i1 %255, i32 %241, i32 0, !dbg !19
%270 = select i1 %255, i32 %245, i32 0, !dbg !19
%271 = select i1 %255, i32 %249, i32 0, !dbg !19
; Pattern 2079 = 0x081F (NOTE(review): xor field would be 2 -- confirm).
%272 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %256, i32 2079), !dbg !20
%273 = add i32 %256, %272, !dbg !22
%274 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %260, i32 2079), !dbg !20
%275 = add i32 %260, %274, !dbg !22
%276 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %257, i32 2079), !dbg !20
%277 = add i32 %276, %257, !dbg !22
%278 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %261, i32 2079), !dbg !20
%279 = add i32 %278, %261, !dbg !22
%280 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %258, i32 2079), !dbg !20
%281 = add i32 %280, %258, !dbg !22
%282 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %262, i32 2079), !dbg !20
%283 = add i32 %282, %262, !dbg !22
%284 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %259, i32 2079), !dbg !20
%285 = add i32 %284, %259, !dbg !22
%286 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %263, i32 2079), !dbg !20
%287 = add i32 %286, %263, !dbg !22
%288 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %264, i32 2079), !dbg !20
%289 = add i32 %288, %264, !dbg !22
%290 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %268, i32 2079), !dbg !20
%291 = add i32 %290, %268, !dbg !22
%292 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %265, i32 2079), !dbg !20
%293 = add i32 %292, %265, !dbg !22
%294 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %269, i32 2079), !dbg !20
%295 = add i32 %294, %269, !dbg !22
%296 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %266, i32 2079), !dbg !20
%297 = add i32 %296, %266, !dbg !22
%298 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %270, i32 2079), !dbg !20
%299 = add i32 %298, %270, !dbg !22
%300 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %267, i32 2079), !dbg !20
%301 = add i32 %300, %267, !dbg !22
%302 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %271, i32 2079), !dbg !20
%303 = add i32 %302, %271, !dbg !22
; ---- Stage 5: flag exchange + swizzle(1055) ---------------------------------
; Flag is re-stored and then read back from the very address it was stored
; to (%87), so no new offset is computed for this stage.
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
%304 = load <8 x i1>, ptr addrspace(3) %87, align 2, !dbg !19
%305 = extractelement <8 x i1> %304, i64 0, !dbg !19
; Mask the 16 stage-4 sums with the new flag.
%306 = select i1 %305, i32 %273, i32 0, !dbg !19
%307 = select i1 %305, i32 %277, i32 0, !dbg !19
%308 = select i1 %305, i32 %281, i32 0, !dbg !19
%309 = select i1 %305, i32 %285, i32 0, !dbg !19
%310 = select i1 %305, i32 %275, i32 0, !dbg !19
%311 = select i1 %305, i32 %279, i32 0, !dbg !19
%312 = select i1 %305, i32 %283, i32 0, !dbg !19
%313 = select i1 %305, i32 %287, i32 0, !dbg !19
%314 = select i1 %305, i32 %289, i32 0, !dbg !19
%315 = select i1 %305, i32 %293, i32 0, !dbg !19
%316 = select i1 %305, i32 %297, i32 0, !dbg !19
%317 = select i1 %305, i32 %301, i32 0, !dbg !19
%318 = select i1 %305, i32 %291, i32 0, !dbg !19
%319 = select i1 %305, i32 %295, i32 0, !dbg !19
%320 = select i1 %305, i32 %299, i32 0, !dbg !19
%321 = select i1 %305, i32 %303, i32 0, !dbg !19
; Pattern 1055 = 0x041F (NOTE(review): xor field would be 1 -- confirm).
%322 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %306, i32 1055), !dbg !20
%323 = add i32 %306, %322, !dbg !22
%324 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %310, i32 1055), !dbg !20
%325 = add i32 %310, %324, !dbg !22
%326 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %307, i32 1055), !dbg !20
%327 = add i32 %326, %307, !dbg !22
%328 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %311, i32 1055), !dbg !20
%329 = add i32 %328, %311, !dbg !22
%330 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %308, i32 1055), !dbg !20
%331 = add i32 %330, %308, !dbg !22
%332 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %312, i32 1055), !dbg !20
%333 = add i32 %332, %312, !dbg !22
%334 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %309, i32 1055), !dbg !20
%335 = add i32 %334, %309, !dbg !22
%336 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %313, i32 1055), !dbg !20
%337 = add i32 %336, %313, !dbg !22
%338 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %314, i32 1055), !dbg !20
%339 = add i32 %338, %314, !dbg !22
%340 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %318, i32 1055), !dbg !20
%341 = add i32 %340, %318, !dbg !22
%342 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %315, i32 1055), !dbg !20
%343 = add i32 %342, %315, !dbg !22
%344 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %319, i32 1055), !dbg !20
%345 = add i32 %344, %319, !dbg !22
%346 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %316, i32 1055), !dbg !20
%347 = add i32 %346, %316, !dbg !22
%348 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %320, i32 1055), !dbg !20
%349 = add i32 %348, %320, !dbg !22
%350 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %317, i32 1055), !dbg !20
%351 = add i32 %350, %317, !dbg !22
%352 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %321, i32 1055), !dbg !20
%353 = add i32 %352, %321, !dbg !22
; ---- Stage 6: two flags, in-register pairwise add (16 -> 8 values) ----------
; No swizzle from here on: pairs of values held by the same work-item are
; masked with two different flags and added together.
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
; %354 = bit0(tid) * 4 is the byte offset of the first flag to read.
%354 = shl nuw nsw i32 %7, 2, !dbg !19
store <1 x i8> %88, ptr addrspace(3) %87, align 2, !dbg !19
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
; Flags are read from byte offsets %354 and %354 + 2.
%355 = zext nneg i32 %354 to i64, !dbg !19
%356 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %355, !dbg !19
%357 = or disjoint i32 %354, 2, !dbg !19
%358 = zext nneg i32 %357 to i64, !dbg !19
%359 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %358, !dbg !19
%360 = load <8 x i1>, ptr addrspace(3) %356, align 4, !dbg !19
%361 = load <8 x i1>, ptr addrspace(3) %359, align 2, !dbg !19
%362 = extractelement <8 x i1> %360, i64 0, !dbg !19
%363 = extractelement <8 x i1> %361, i64 0, !dbg !19
; Within each group of four stage-5 sums, the first two are masked with
; flag %362 and the last two with flag %363.
%364 = select i1 %362, i32 %323, i32 0, !dbg !19
%365 = select i1 %362, i32 %327, i32 0, !dbg !19
%366 = select i1 %363, i32 %331, i32 0, !dbg !19
%367 = select i1 %363, i32 %335, i32 0, !dbg !19
%368 = select i1 %362, i32 %325, i32 0, !dbg !19
%369 = select i1 %362, i32 %329, i32 0, !dbg !19
%370 = select i1 %363, i32 %333, i32 0, !dbg !19
%371 = select i1 %363, i32 %337, i32 0, !dbg !19
%372 = select i1 %362, i32 %339, i32 0, !dbg !19
%373 = select i1 %362, i32 %343, i32 0, !dbg !19
%374 = select i1 %363, i32 %347, i32 0, !dbg !19
%375 = select i1 %363, i32 %351, i32 0, !dbg !19
%376 = select i1 %362, i32 %341, i32 0, !dbg !19
%377 = select i1 %362, i32 %345, i32 0, !dbg !19
%378 = select i1 %363, i32 %349, i32 0, !dbg !19
%379 = select i1 %363, i32 %353, i32 0, !dbg !19
; Each %363-masked value is added to the %362-masked value two slots earlier.
%380 = add i32 %366, %364, !dbg !22
%381 = add i32 %367, %365, !dbg !22
%382 = add i32 %370, %368, !dbg !22
%383 = add i32 %371, %369, !dbg !22
%384 = add i32 %374, %372, !dbg !22
%385 = add i32 %375, %373, !dbg !22
%386 = add i32 %378, %376, !dbg !22
%387 = add i32 %379, %377, !dbg !22
; ---- Stage 7: four flags, in-register pairwise add (8 -> 8 values) ----------
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
; The flag byte now goes to byte offset (tid & 2) + (tid & 3), which maps
; tid & 3 = 0,1,2,3 to offsets 0,1,4,5. %388 = (tid & 3) >> 1 is used in stage 8.
%388 = lshr i32 %84, 1, !dbg !19
%389 = and i32 %3, 2, !dbg !19
%390 = add nuw nsw i32 %389, %84, !dbg !19
%391 = zext nneg i32 %390 to i64, !dbg !19
%392 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %391, !dbg !19
store <1 x i8> %88, ptr addrspace(3) %392, align 1, !dbg !19
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
; Two flag bytes are fetched per load here: <16 x i1> at byte offsets 0 and 4,
; with elements 0 and 8 extracted from each.
; NOTE(review): elements 0 and 8 are presumably bit 0 of the first and second
; byte (offsets 0,1 and 4,5) -- confirm the bit layout the backend uses for
; <16 x i1> loads.
%393 = load <16 x i1>, ptr addrspace(3) @global_smem, align 16, !dbg !19
%394 = load <16 x i1>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @global_smem, i64 4), align 4, !dbg !19
%395 = extractelement <16 x i1> %393, i64 0, !dbg !19
%396 = extractelement <16 x i1> %393, i64 8, !dbg !19
%397 = extractelement <16 x i1> %394, i64 0, !dbg !19
%398 = extractelement <16 x i1> %394, i64 8, !dbg !19
; Every stage-6 sum is used twice: once with a flag from the first load
; (%395/%396) and once with a flag from the second load (%397/%398).
%399 = select i1 %395, i32 %380, i32 0, !dbg !19
%400 = select i1 %396, i32 %381, i32 0, !dbg !19
%401 = select i1 %397, i32 %380, i32 0, !dbg !19
%402 = select i1 %398, i32 %381, i32 0, !dbg !19
%403 = select i1 %395, i32 %382, i32 0, !dbg !19
%404 = select i1 %396, i32 %383, i32 0, !dbg !19
%405 = select i1 %397, i32 %382, i32 0, !dbg !19
%406 = select i1 %398, i32 %383, i32 0, !dbg !19
%407 = select i1 %395, i32 %384, i32 0, !dbg !19
%408 = select i1 %396, i32 %385, i32 0, !dbg !19
%409 = select i1 %397, i32 %384, i32 0, !dbg !19
%410 = select i1 %398, i32 %385, i32 0, !dbg !19
%411 = select i1 %395, i32 %386, i32 0, !dbg !19
%412 = select i1 %396, i32 %387, i32 0, !dbg !19
%413 = select i1 %397, i32 %386, i32 0, !dbg !19
%414 = select i1 %398, i32 %387, i32 0, !dbg !19
; Adjacent masked values are summed.
%415 = add i32 %399, %400, !dbg !22
%416 = add i32 %401, %402, !dbg !22
%417 = add i32 %403, %404, !dbg !22
%418 = add i32 %405, %406, !dbg !22
%419 = add i32 %407, %408, !dbg !22
%420 = add i32 %409, %410, !dbg !22
%421 = add i32 %411, %412, !dbg !22
%422 = add i32 %413, %414, !dbg !22
; ---- Stage 8: four flags, in-register pairwise add (8 -> 8 values) ----------
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
; Flag byte goes to byte offset ((tid & 3) >> 1) + (tid & 3), which maps
; tid & 3 = 0,1,2,3 to offsets 0,1,3,4.
%423 = add nuw nsw i32 %388, %84, !dbg !19
%424 = zext nneg i32 %423 to i64, !dbg !19
%425 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i64 %424, !dbg !19
store <1 x i8> %88, ptr addrspace(3) %425, align 1, !dbg !19
fence syncscope("workgroup") release, !dbg !19
tail call void @llvm.amdgcn.s.barrier(), !dbg !19
fence syncscope("workgroup") acquire, !dbg !19
; Four flags are read from the constant byte offsets 0, 3, 1 and 4.
%426 = load <8 x i1>, ptr addrspace(3) @global_smem, align 16, !dbg !19
%427 = load <8 x i1>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @global_smem, i64 3), align 1, !dbg !19
%428 = load <8 x i1>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @global_smem, i64 1), align 1, !dbg !19
%429 = load <8 x i1>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @global_smem, i64 4), align 4, !dbg !19
%430 = extractelement <8 x i1> %426, i64 0, !dbg !19
%431 = extractelement <8 x i1> %427, i64 0, !dbg !19
%432 = extractelement <8 x i1> %428, i64 0, !dbg !19
%433 = extractelement <8 x i1> %429, i64 0, !dbg !19
; Each stage-7 sum is again used twice, with flags %430/%431 for the first
; two sums of a group of four and %432/%433 for the last two.
%434 = select i1 %430, i32 %415, i32 0, !dbg !19
%435 = select i1 %431, i32 %415, i32 0, !dbg !19
%436 = select i1 %430, i32 %416, i32 0, !dbg !19
%437 = select i1 %431, i32 %416, i32 0, !dbg !19
%438 = select i1 %432, i32 %417, i32 0, !dbg !19
%439 = select i1 %433, i32 %417, i32 0, !dbg !19
%440 = select i1 %432, i32 %418, i32 0, !dbg !19
%441 = select i1 %433, i32 %418, i32 0, !dbg !19
%442 = select i1 %430, i32 %419, i32 0, !dbg !19
%443 = select i1 %431, i32 %419, i32 0, !dbg !19
%444 = select i1 %430, i32 %420, i32 0, !dbg !19
%445 = select i1 %431, i32 %420, i32 0, !dbg !19
%446 = select i1 %432, i32 %421, i32 0, !dbg !19
%447 = select i1 %433, i32 %421, i32 0, !dbg !19
%448 = select i1 %432, i32 %422, i32 0, !dbg !19
%449 = select i1 %433, i32 %422, i32 0, !dbg !19
; Final eight per-work-item results.
%450 = add i32 %438, %434, !dbg !22
%451 = add i32 %439, %435, !dbg !22
%452 = add i32 %440, %436, !dbg !22
%453 = add i32 %441, %437, !dbg !22
%454 = add i32 %446, %442, !dbg !22
%455 = add i32 %447, %443, !dbg !22
%456 = add i32 %448, %444, !dbg !22
%457 = add i32 %449, %445, !dbg !22
; ---- Global stores ----------------------------------------------------------
; Output addresses use the element indices %12..%15 and %16..%19 computed at
; the top of the function (%20 and %22 are the already-extended %12 and %16).
%458 = getelementptr i32, ptr addrspace(1) %1, i64 %20, !dbg !24
%459 = zext nneg i32 %13 to i64, !dbg !24
%460 = getelementptr i32, ptr addrspace(1) %1, i64 %459, !dbg !24
%461 = zext nneg i32 %14 to i64, !dbg !24
%462 = getelementptr i32, ptr addrspace(1) %1, i64 %461, !dbg !24
%463 = zext nneg i32 %15 to i64, !dbg !24
%464 = getelementptr i32, ptr addrspace(1) %1, i64 %463, !dbg !24
%465 = getelementptr i32, ptr addrspace(1) %1, i64 %22, !dbg !24
%466 = zext nneg i32 %17 to i64, !dbg !24
%467 = getelementptr i32, ptr addrspace(1) %1, i64 %466, !dbg !24
%468 = zext nneg i32 %18 to i64, !dbg !24
%469 = getelementptr i32, ptr addrspace(1) %1, i64 %468, !dbg !24
%470 = zext nneg i32 %19 to i64, !dbg !24
%471 = getelementptr i32, ptr addrspace(1) %1, i64 %470, !dbg !24
; Unlike the <4 x i32> input loads, the results are written with eight
; separate 4-byte-aligned scalar stores.
store i32 %450, ptr addrspace(1) %458, align 4, !dbg !25
store i32 %451, ptr addrspace(1) %460, align 4, !dbg !25
store i32 %452, ptr addrspace(1) %462, align 4, !dbg !25
store i32 %453, ptr addrspace(1) %464, align 4, !dbg !25
store i32 %454, ptr addrspace(1) %465, align 4, !dbg !25
store i32 %455, ptr addrspace(1) %467, align 4, !dbg !25
store i32 %456, ptr addrspace(1) %469, align 4, !dbg !25
store i32 %457, ptr addrspace(1) %471, align 4, !dbg !25
ret void, !dbg !26
}
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
; Returns the calling work-item's id in the x dimension (never undef).
declare noundef i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn
; Barrier intrinsic; declared convergent, takes no arguments, returns nothing.
declare void @llvm.amdgcn.s.barrier() #2
; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn memory(none)
; Swizzle intrinsic: first operand is the i32 value, second is an immediate
; i32 pattern (immarg). Declared convergent and memory(none).
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32 immarg) #3
; #0: attributes of the kernel itself -- flat work-group size range 1..512,
;     waves-per-EU hint of 1, IEEE denormal handling for f32.
attributes #0 = { mustprogress nofree norecurse nounwind willreturn "amdgpu-flat-work-group-size"="1,512" "amdgpu-waves-per-eu"="1" "denormal-fp-math-f32"="ieee" }
; #1: attributes of llvm.amdgcn.workitem.id.x.
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; #2: attributes of llvm.amdgcn.s.barrier.
attributes #2 = { convergent mustprogress nocallback nofree nounwind willreturn }
; #3: attributes of llvm.amdgcn.ds.swizzle.
attributes #3 = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) }
; ---- Module flags and debug info --------------------------------------------
!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}
!0 = !{i32 2, !"Debug Info Version", i32 3}
; HSA code object version requested for this module (value 400).
!1 = !{i32 1, !"amdhsa_code_object_version", i32 400}
; Compile unit: producer is "triton", line-table-only debug info.
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
!3 = !DIFile(filename: "test_standard.py", directory: "/home/openai/triton/python/test/unit/language")
; flip_kernel is defined at test_standard.py:62.
!4 = distinct !DISubprogram(name: "flip_kernel", linkageName: "flip_kernel", scope: !3, file: !3, line: 62, type: !5, scopeLine: 62, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
!6 = !{}
; !7..!12: locations in test_standard.py lines 64-66 (index computation and loads).
!7 = !DILocation(line: 64, column: 24, scope: !4)
!8 = !DILocation(line: 64, column: 29, scope: !4)
!9 = !DILocation(line: 65, column: 17, scope: !4)
!10 = !DILocation(line: 65, column: 28, scope: !4)
!11 = !DILocation(line: 66, column: 20, scope: !4)
!12 = !DILocation(line: 66, column: 16, scope: !4)
; !13..!23: locations in standard.py (lines 420, 421, 427, 267, 256), all
; inlined at test_standard.py:67 (!16).
!13 = !DILocation(line: 420, column: 28, scope: !14, inlinedAt: !16)
!14 = distinct !DILexicalBlockFile(scope: !4, file: !15, discriminator: 0)
!15 = !DIFile(filename: "standard.py", directory: "/home/openai/triton/python/triton/language")
!16 = !DILocation(line: 67, column: 16, scope: !4)
!17 = !DILocation(line: 421, column: 30, scope: !14, inlinedAt: !16)
!18 = !DILocation(line: 421, column: 42, scope: !14, inlinedAt: !16)
!19 = !DILocation(line: 427, column: 20, scope: !14, inlinedAt: !16)
!20 = !DILocation(line: 267, column: 36, scope: !21, inlinedAt: !16)
!21 = distinct !DILexicalBlockFile(scope: !14, file: !15, discriminator: 0)
!22 = !DILocation(line: 256, column: 15, scope: !23, inlinedAt: !16)
!23 = distinct !DILexicalBlockFile(scope: !21, file: !15, discriminator: 0)
; !24..!26: locations in test_standard.py line 68 (output addressing, stores, return).
!24 = !DILocation(line: 68, column: 17, scope: !4)
!25 = !DILocation(line: 68, column: 24, scope: !4)
!26 = !DILocation(line: 68, column: 4, scope: !4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment