Skip to content

Instantly share code, notes, and snippets.

View minjang's full-sized avatar

Minjang Kim minjang

  • Facebook
  • Menlo Park, CA
View GitHub Profile
from typing import Optional, Union
import os
import numpy as np
import torch
import triton
import triton.language as tl
import math
from triton.runtime.jit import TensorWrapper, reinterpret
from numpy.random import RandomState
@minjang
minjang / patch.diff
Created May 23, 2024 01:10
A quick patch to make triton-cpu runnable for github.com/ienkovich/triton-cpu/tree/ienkovich/change-cast-test-size
diff --git a/include/triton/Conversion/CMakeLists.txt b/include/triton/Conversion/CMakeLists.txt
index ae31ac93..691104f3 100644
--- a/include/triton/Conversion/CMakeLists.txt
+++ b/include/triton/Conversion/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_subdirectory(TritonCPUToLLVM)
+# add_subdirectory(TritonCPUToLLVM)
add_subdirectory(TritonGPUToLLVM)
-add_subdirectory(TritonToTritonCPU)
+# add_subdirectory(TritonToTritonCPU)
@minjang
minjang / dlmalloc.c
Created December 10, 2016 19:43
Version 2.8.6 Wed Aug 29 06:57:58 2012
/*
This is a version (aka dlmalloc) of malloc/free/realloc written by
Doug Lea and released to the public domain, as explained at
http://creativecommons.org/publicdomain/zero/1.0/ Send questions,
comments, complaints, performance data, etc to dl@cs.oswego.edu
* Version 2.8.6 Wed Aug 29 06:57:58 2012 Doug Lea
Note: There may be an updated version of this malloc obtainable at
ftp://gee.cs.oswego.edu/pub/misc/malloc.c
Check before installing!
@minjang
minjang / swap-mem2reg-inline-instcombine.ll
Created November 29, 2016 05:36
Fully optimized code
; test() in its fully optimized form: the two values returned by @get are
; passed straight to @process, in call order, and its result is returned.
define i32 @_Z4testv() #0 {
entry:
  %a = call i32 @get()
  %b = call i32 @get()
  ; The call result needs its own SSA name: reusing %b here would define the
  ; same local value twice, which the LLVM assembler rejects.
  %result = call i32 @process(i32 %a, i32 %b)
  ret i32 %result
}
; Step-by-step simplification of the xor chain: %xor1 folds down to %b.
%a = call i32 @get()
%b = call i32 @get()
%xor = xor i32 %b, %a
%xor1 = xor i32 %a, %xor
; => %xor1 = %a ^ %xor
; => %xor1 = %a ^ (%b ^ %a) ; a ^ (b ^ a) = b ^ 0
; => %xor1 = %b ^ 0 ; b ^ 0 = b
; => %xor1 = %b ; afterwards, replace every use of %xor1 with %b
; => delete %xor1
@minjang
minjang / swap-mem2reg-inline-mem2reg.ll
Created November 29, 2016 03:17
After mem2reg, inline, and mem2reg optimizations
; test() after inlining: only the xor chain of xor_swap remains between the
; two get() calls and the process() call.
define i32 @_Z4testv() #0 {
entry:
%call = call i32 @_Z3getv() ; a = get();
%call1 = call i32 @_Z3getv() ; b = get();
; temp_swap(a, b) has disappeared
%xor.i = xor i32 %call1, %call ; xor_swap(a, b) has turned into xor_swap(b, a)
; %xor1.i = %call ^ (%call1 ^ %call), which equals %call1
%xor1.i = xor i32 %call, %xor.i
; %xor2.i = (%call1 ^ %call) ^ %call1, which equals %call
%xor2.i = xor i32 %xor.i, %xor1.i
%call2 = call i32 @_Z7processii(i32 %xor2.i, i32 %xor1.i)
ret i32 %call2
; NOTE(review): the closing '}' of this function is not present in this excerpt.
@minjang
minjang / swap-mem2reg-gvn.ll
Last active November 28, 2016 23:03
Applied mem2reg and gvn pass
; Swaps the i32 values stored at %a and %b: both are loaded first, then each
; loaded value is stored to the other pointer.
; NOTE(review): the trailing W/R comment columns look like a write/read chart
; for %0 and %1 — confirm against the original gist formatting.
define linkonce_odr void @void temp_swap<int>(int&, int&)(i32* %a, i32* %b) #2 {
entry: ; %0
%0 = load i32, i32* %a, align 4 ; W %1
%1 = load i32, i32* %b, align 4 ; | W
store i32 %1, i32* %a, align 4 ; R |
store i32 %0, i32* %b, align 4 ; R
ret void
}
define linkonce_odr void @void xor_swap<int>(int&, int&)(i32* %a, i32* %b) #2 {
@minjang
minjang / swap-mem2reg-gvn-inline.ll
Last active November 29, 2016 01:00
After mem2reg, gvn, and inline optimizations
define i32 @_Z4testv() #0 {
entry:
%a = alloca i32, align 4
%b = alloca i32, align 4
%call = call i32 @_Z3getv() ; int a = get();
store i32 %call, i32* %a, align 4
%call1 = call i32 @_Z3getv() ; int b = get();
store i32 %call1, i32* %b, align 4
%0 = load i32, i32* %a, align 4 ; temp_swap(a, b);
%1 = load i32, i32* %b, align 4
@minjang
minjang / gist:89ee4cd6a040dfda0d7dc23603b3c8c3
Created November 28, 2016 10:41
LLVM -O3 optimization passes
$ ./opt -O3 -debug-pass=Structure -o swap.opt.bc swap.bc
Pass Arguments: -tti -tbaa -scoped-noalias -assumption-cache-tracker -targetlibinfo -verify -simplifycfg -domtree -sroa -early-cse -basicaa -aa -memdep -memoryssa -gvn-hoist -lower-expect
Target Transform Information
Type-Based Alias Analysis
Scoped NoAlias Alias Analysis
Assumption Cache Tracker
Target Library Information
FunctionPass Manager
Module Verifier
Simplify the CFG
@minjang
minjang / swap-only-mem2reg.ll
Last active November 28, 2016 19:42
Applied only mem2reg pass
; Swaps the i32 values stored at %a and %b: both are loaded first, then each
; loaded value is stored to the other pointer.
; NOTE(review): the trailing W/R comment columns look like a write/read chart
; for %0 and %1 — confirm against the original gist formatting.
define linkonce_odr void @void temp_swap<int>(int&, int&)(i32* %a, i32* %b) #2 {
entry: ; %0
%0 = load i32, i32* %a, align 4 ; W %1
%1 = load i32, i32* %b, align 4 ; | W
store i32 %1, i32* %a, align 4 ; R |
store i32 %0, i32* %b, align 4 ; R
ret void
}
define linkonce_odr void @void xor_swap<int>(int&, int&)(i32* %a, i32* %b) #2 {