Created
August 20, 2020 05:47
-
-
Save MasonProtter/8f8ae49ff63eb6f1e1f19e11fb2a3ebe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#+BEGIN_SRC jupyter-julia
using CUDA
"""
    StaticString{N} <: AbstractString

Fixed-length string whose `N` characters are stored inline in an `NTuple{N, Char}`.
The length is part of the type, so strings of different lengths are different
concrete types.

The methods defined alongside this type are conversion to `String`, `show`, and
concatenation with `*`.
"""
struct StaticString{N} <: AbstractString
    chars::NTuple{N, Char}
end

"""
    s"text"

Build a `StaticString{N}` from the literal `text`, where `N` is the number of
characters in `text`. The characters are split while the macro is expanded, so the
generated code only contains a constructor call on a constant tuple.

String macros receive the literal text as written, so escape sequences are not
interpreted.
"""
macro s_str(s)
    chars = Tuple(s)
    N = length(chars)
    # Deliberately not wrapped in `esc`: nothing here comes from the caller, and leaving
    # the expression hygienic makes `StaticString` resolve in the module that defines
    # this macro rather than in whatever scope the literal is used in.
    return :(StaticString{$N}($chars))
end

# Convert to an ordinary `String` by concatenating the stored characters.
Base.String(s::StaticString) = join(s.chars)

# Print as `StaticString{N}("text")`. The text is written as-is, without escaping.
function Base.show(io::IO, s::StaticString{N}) where {N}
    print(io, "StaticString{", N, "}(\"", String(s), "\")")
    return nothing
end

# Concatenation: the result length `N + M` is computed from the type parameters and the
# two character tuples are spliced into one.
function Base.:(*)(s1::StaticString{N}, s2::StaticString{M}) where {N, M}
    return StaticString{N + M}((s1.chars..., s2.chars...))
end
# All elements of one vector literal need the same length parameter so that the vector
# has a concrete element type. "hello" is therefore padded to 7 characters to match
# "goodbye": the recorded result reports `StaticString{7}` for this operand and
# `StaticString{11}` (4 + 7) for the output.
@device_code_sass cu([s"abc ", s"123 "]) .* cu([s"hello  ", s"goodbye"])
#+END_SRC
#+RESULTS:
#+BEGIN_EXAMPLE
// PTX CompilerJob of kernel broadcast_kernel(CUDA.CuKernelContext, CuDeviceArray{StaticString{11},1,CUDA.AS.Global}, Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64}},typeof(*),Tuple{Base.Broadcast.Extruded{CuDeviceArray{StaticString{4},1,CUDA.AS.Global},Tuple{Bool},Tuple{Int64}},Base.Broadcast.Extruded{CuDeviceArray{StaticString{7},1,CUDA.AS.Global},Tuple{Bool},Tuple{Int64}}}}, Int64) for sm_75 | |
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM75 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM75)" | |
.elftype @"ET_EXEC" | |
//--------------------- .text._Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64 -------------------------- | |
.section .text._Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64,"ax",@progbits | |
.sectioninfo @"SHI_REGISTERS=40" | |
.align 128 | |
.global _Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64 | |
.type _Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64,@function | |
.size _Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64,(.L_27 - _Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64) | |
.other _Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64,@"STO_CUDA_ENTRY STV_DEFAULT" | |
_Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64: | |
.text._Z27julia_broadcast_kernel_386615CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64: | |
IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ; | |
IMAD.MOV.U32 R0, RZ, RZ, c[0x0][0x1b8] ; | |
IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x1bc] ; | |
ISETP.GE.U32.AND P0, PT, R0, 0x1, PT ; | |
ISETP.GE.AND.EX P0, PT, R2, RZ, PT, P0 ; | |
@!P0 EXIT ; | |
S2R R6, SR_TID.X ; | |
ULDC.U8 UR4, c[0x0][0x180] ; | |
ISETP.LT.U32.AND P0, PT, RZ, c[0x0][0x1b8], PT ; | |
ULOP3.LUT UR4, UR4, 0xff, URZ, 0xc0, !UP7 ; | |
S2R R9, SR_CTAID.X ; | |
ULDC.U8 UR6, c[0x0][0x1a0] ; | |
ISETP.GT.AND.EX P0, PT, R2, RZ, PT, P0 ; | |
ULOP3.LUT UR6, UR6, 0xff, URZ, 0xc0, !UP7 ; | |
ISETP.NE.AND P1, PT, RZ, UR4, PT ; | |
SEL R0, RZ, c[0x0][0x1b8], !P0 ; | |
SEL R2, RZ, c[0x0][0x1bc], !P0 ; | |
IADD3 R6, R6, 0x1, RZ ; | |
@!P1 BRA `(.L_2) ; | |
IMAD.MOV.U32 R7, RZ, RZ, RZ ; | |
UMOV UR4, URZ ; | |
IMAD.MOV.U32 R8, RZ, RZ, c[0x0][0x168] ; | |
ISETP.NE.AND P1, PT, RZ, UR6, PT ; | |
IMAD.WIDE.U32 R6, R9, c[0x0][0x0], R6 ; | |
MOV R9, c[0x0][0x16c] ; | |
ISETP.GT.U32.AND P0, PT, R6.reuse, c[0x0][0x160], PT ; | |
IADD3 R16, R7, UR4, RZ ; | |
IADD3 R3, P2, R6, -0x1, RZ ; | |
ISETP.GT.AND.EX P0, PT, R16.reuse, c[0x0][0x164], PT, P0 ; | |
LEA R10, P3, R3.reuse, c[0x0][0x178], 0x4 ; | |
IMAD.WIDE.U32 R8, R3, 0x2c, R8 ; | |
IADD3.X R4, R16, -0x1, RZ, P2, !PT ; | |
@P0 EXIT ; | |
IMAD.MOV.U32 R5, RZ, RZ, c[0x0][0x0] ; | |
LEA.HI.X R7, R3, c[0x0][0x17c], R4, 0x4, P3 ; | |
IMAD R31, R4, 0x2c, RZ ; | |
UMOV UR4, URZ ; | |
IMAD.WIDE.U32 R4, R5, c[0x0][0xc], RZ ; | |
IADD3 R14, P2, R8, 0x14, RZ ; | |
IADD3 R10, P0, R10, 0x8, RZ ; | |
IMAD.MOV.U32 R33, RZ, RZ, R6 ; | |
IMAD.X R31, R9, 0x1, R31, P2 ; | |
IADD3 R37, R5, UR4, RZ ; | |
IMAD.X R7, RZ, RZ, R7, P0 ; | |
SHF.L.U64.HI R35, R4, 0x4, R37 ; | |
.L_3: | |
SEL R3, R33, c[0x0][0x1a8], P1 ; | |
IMAD.MOV.U32 R9, RZ, RZ, c[0x0][0x19c] ; | |
MOV R8, c[0x0][0x198] ; | |
SEL R6, R16, c[0x0][0x1ac], P1 ; | |
IMAD.WIDE.U32 R8, R3, 0x1c, R8 ; | |
IMAD R11, R6, 0x1c, RZ ; | |
IMAD.MOV.U32 R6, RZ, RZ, R10 ; | |
IMAD.IADD R9, R9, 0x1, R11 ; | |
LDG.E.SYS R3, [R6+-0x8] ; | |
LDG.E.SYS R11, [R6+-0x4] ; | |
LDG.E.SYS R13, [R6] ; | |
LDG.E.SYS R15, [R6+0x4] ; | |
LDG.E.SYS R17, [R8+-0x1c] ; | |
LDG.E.SYS R19, [R8+-0x18] ; | |
LDG.E.SYS R21, [R8+-0x14] ; | |
LDG.E.SYS R23, [R8+-0x10] ; | |
LDG.E.SYS R25, [R8+-0xc] ; | |
LDG.E.SYS R27, [R8+-0x8] ; | |
LDG.E.SYS R29, [R8+-0x4] ; | |
IADD3 R10, P0, R0, -0x1, RZ ; | |
IADD3 R0, P2, R33, R4, RZ ; | |
IADD3.X R12, R2, -0x1, RZ, P0, !PT ; | |
ISETP.NE.U32.AND P0, PT, R10, RZ, PT ; | |
IMAD.X R2, R16, 0x1, R37, P2 ; | |
MOV R9, R31 ; | |
IMAD.MOV.U32 R8, RZ, RZ, R14 ; | |
ISETP.NE.AND.EX P0, PT, R12, RZ, PT, P0 ; | |
STG.E.SYS [R8+-0x14], R3 ; | |
STG.E.SYS [R8+-0x10], R11 ; | |
STG.E.SYS [R8+-0xc], R13 ; | |
LEA R3, P2, R4, R6, 0x4 ; | |
STG.E.SYS [R8+-0x8], R15 ; | |
STG.E.SYS [R8+-0x4], R17 ; | |
STG.E.SYS [R8], R19 ; | |
STG.E.SYS [R8+0x4], R21 ; | |
STG.E.SYS [R8+0x8], R23 ; | |
STG.E.SYS [R8+0xc], R25 ; | |
STG.E.SYS [R8+0x10], R27 ; | |
STG.E.SYS [R8+0x14], R29 ; | |
@!P0 EXIT ; | |
IMAD.MOV.U32 R33, RZ, RZ, R0 ; | |
IMAD.MOV.U32 R16, RZ, RZ, R2 ; | |
IMAD.WIDE.U32 R8, R4, 0x2c, R8 ; | |
ISETP.GT.U32.AND P0, PT, R33, c[0x0][0x160], PT ; | |
IMAD R31, R37, 0x2c, RZ ; | |
ISETP.GT.AND.EX P0, PT, R16, c[0x0][0x164], PT, P0 ; | |
IMAD.X R6, R7, 0x1, R35, P2 ; | |
IMAD.MOV.U32 R0, RZ, RZ, R10 ; | |
IADD3 R31, R9, R31, RZ ; | |
IMAD.MOV.U32 R14, RZ, RZ, R8 ; | |
MOV R7, R6 ; | |
IMAD.MOV.U32 R2, RZ, RZ, R12 ; | |
IMAD.MOV.U32 R10, RZ, RZ, R3 ; | |
@!P0 BRA `(.L_3) ; | |
EXIT ; | |
.L_2: | |
IMAD.MOV.U32 R7, RZ, RZ, RZ ; | |
UMOV UR8, URZ ; | |
ISETP.NE.AND P1, PT, RZ, UR6, PT ; | |
ULDC UR5, c[0x0][0x188] ; | |
IMAD.WIDE.U32 R8, R9, c[0x0][0x0], R6 ; | |
ULDC UR4, c[0x0][0x178] ; | |
ULEA UR4, UP0, UR5, UR4, 0x4 ; | |
IMAD.MOV.U32 R6, RZ, RZ, c[0x0][0x168] ; | |
ULDC UR7, c[0x0][0x18c] ; | |
IMAD.MOV.U32 R7, RZ, RZ, c[0x0][0x16c] ; | |
ISETP.GT.U32.AND P0, PT, R8, c[0x0][0x160], PT ; | |
IADD3 R12, R9, UR8, RZ ; | |
ULDC UR8, c[0x0][0x17c] ; | |
IMAD.WIDE.U32 R6, R8, 0x2c, R6 ; | |
ULEA.HI.X UR5, UR5, UR8, UR7, 0x4, UP0 ; | |
ISETP.GT.AND.EX P0, PT, R12, c[0x0][0x164], PT, P0 ; | |
@P0 EXIT ; | |
IMAD.MOV.U32 R4, RZ, RZ, c[0x0][0x0] ; | |
IADD3 R10, P0, R6, -0x18, RZ ; | |
IMAD R29, R12, 0x2c, RZ ; | |
BMOV.32.CLEAR RZ, B0 ; | |
IMAD.WIDE.U32 R4, R4, c[0x0][0xc], RZ ; | |
UMOV UR6, URZ ; | |
BSSY B0, `(.L_4) ; | |
IADD3.X R29, R7, -0x1, R29, P0, !PT ; | |
MOV R31, R8 ; | |
IADD3 R33, R5, UR6, RZ ; | |
.L_5: | |
SEL R3, R31, c[0x0][0x1a8], P1 ; | |
IMAD.MOV.U32 R6, RZ, RZ, c[0x0][0x198] ; | |
SEL R8, R12, c[0x0][0x1ac], P1 ; | |
IMAD.MOV.U32 R7, RZ, RZ, c[0x0][0x19c] ; | |
LDG.E.SYS R11, [UR4+-0x8] ; | |
IMAD.WIDE.U32 R6, R3, 0x1c, R6 ; | |
LDG.E.SYS R13, [UR4+-0x4] ; | |
IMAD R9, R8, 0x1c, RZ ; | |
LDG.E.SYS R3, [UR4+-0x10] ; | |
IMAD.IADD R7, R7, 0x1, R9 ; | |
LDG.E.SYS R9, [UR4+-0xc] ; | |
LDG.E.SYS R15, [R6+-0x1c] ; | |
LDG.E.SYS R17, [R6+-0x18] ; | |
LDG.E.SYS R19, [R6+-0x14] ; | |
LDG.E.SYS R21, [R6+-0x10] ; | |
LDG.E.SYS R23, [R6+-0xc] ; | |
LDG.E.SYS R25, [R6+-0x8] ; | |
LDG.E.SYS R27, [R6+-0x4] ; | |
IADD3 R8, P0, R0, -0x1, RZ ; | |
IADD3.X R14, R2, -0x1, RZ, P0, !PT ; | |
ISETP.NE.U32.AND P0, PT, R8, RZ, PT ; | |
ISETP.NE.AND.EX P0, PT, R14, RZ, PT, P0 ; | |
IMAD.MOV.U32 R6, RZ, RZ, R10 ; | |
MOV R7, R29 ; | |
IADD3 R0, P2, R31, R4, RZ ; | |
IMAD.X R2, R12, 0x1, R33, P2 ; | |
STG.E.SYS [R6+-0xc], R11 ; | |
STG.E.SYS [R6+-0x8], R13 ; | |
STG.E.SYS [R6+-0x14], R3 ; | |
STG.E.SYS [R6+-0x10], R9 ; | |
STG.E.SYS [R6+-0x4], R15 ; | |
STG.E.SYS [R6], R17 ; | |
STG.E.SYS [R6+0x4], R19 ; | |
STG.E.SYS [R6+0x8], R21 ; | |
STG.E.SYS [R6+0xc], R23 ; | |
STG.E.SYS [R6+0x10], R25 ; | |
STG.E.SYS [R6+0x14], R27 ; | |
@!P0 EXIT ; | |
IMAD.MOV.U32 R31, RZ, RZ, R0 ; | |
IMAD.MOV.U32 R12, RZ, RZ, R2 ; | |
IMAD.WIDE.U32 R6, R4, 0x2c, R6 ; | |
ISETP.GT.U32.AND P0, PT, R31, c[0x0][0x160], PT ; | |
IMAD R29, R33, 0x2c, RZ ; | |
ISETP.GT.AND.EX P0, PT, R12, c[0x0][0x164], PT, P0 ; | |
IMAD.MOV.U32 R0, RZ, RZ, R8 ; | |
MOV R10, R6 ; | |
IMAD.IADD R29, R29, 0x1, R7 ; | |
IMAD.MOV.U32 R2, RZ, RZ, R14 ; | |
@!P0 BRA `(.L_5) ; | |
BSYNC B0 ; | |
.L_4: | |
EXIT ; | |
.L_6: | |
BRA `(.L_6); | |
.L_27: | |
#+END_EXAMPLE
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment