@minjang
Created May 23, 2024 01:10
A quick patch to make triton-cpu (github.com/ienkovich/triton-cpu/tree/ienkovich/change-cast-test-size) runnable.
diff --git a/include/triton/Conversion/CMakeLists.txt b/include/triton/Conversion/CMakeLists.txt
index ae31ac93..691104f3 100644
--- a/include/triton/Conversion/CMakeLists.txt
+++ b/include/triton/Conversion/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_subdirectory(TritonCPUToLLVM)
+# add_subdirectory(TritonCPUToLLVM)
add_subdirectory(TritonGPUToLLVM)
-add_subdirectory(TritonToTritonCPU)
+# add_subdirectory(TritonToTritonCPU)
add_subdirectory(TritonToTritonGPU)
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index 83db4ae4..1998a41a 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -1,4 +1,4 @@
-#add_subdirectory(TritonToTritonCPU)
+# add_subdirectory(TritonToTritonCPU)
add_subdirectory(TritonToTritonGPU)
-#add_subdirectory(TritonCPUToLLVM)
+# add_subdirectory(TritonCPUToLLVM)
add_subdirectory(TritonGPUToLLVM)
diff --git a/python/src/passes.cc b/python/src/passes.cc
index df7d9faa..716f2623 100644
--- a/python/src/passes.cc
+++ b/python/src/passes.cc
@@ -6,7 +6,7 @@
#include "triton/Analysis/Allocation.h"
#include "triton/Analysis/Membar.h"
#include "triton/Conversion/TritonGPUToLLVM/Passes.h"
-#include "triton/Conversion/TritonToTritonCPU/Passes.h"
+// #include "triton/Conversion/TritonToTritonCPU/Passes.h"
#include "triton/Conversion/TritonToTritonGPU/Passes.h"
#include "triton/Dialect/Triton/Transforms/Passes.h"
#include "triton/Dialect/TritonGPU/Transforms/Passes.h"
diff --git a/python/triton/compiler/compiler.py b/python/triton/compiler/compiler.py
index 6eb7aa29..7684d7f1 100644
--- a/python/triton/compiler/compiler.py
+++ b/python/triton/compiler/compiler.py
@@ -224,6 +224,7 @@ def filter_traceback(e: BaseException):


 def compile(src, target=None, options=None):
+    print(f"Compiling from triton-cpu-ienkovich...")
     if target is None:
         target = driver.active.get_current_target()
     assert isinstance(target, GPUTarget), "target must be of GPUTarget type"
diff --git a/python/triton/runtime/build.py b/python/triton/runtime/build.py
index d7baeb28..bf9c740a 100644
--- a/python/triton/runtime/build.py
+++ b/python/triton/runtime/build.py
@@ -18,18 +18,19 @@ def quiet():
         sys.stdout, sys.stderr = old_stdout, old_stderr


-def _build(name, src, srcdir, library_dirs, include_dirs, libraries):
+def _build(name, src, srcdir, library_dirs, include_dirs, libraries, cxx=False):
     suffix = sysconfig.get_config_var('EXT_SUFFIX')
     so = os.path.join(srcdir, '{name}{suffix}'.format(name=name, suffix=suffix))
     # try to avoid setuptools if possible
-    cc = os.environ.get("CC")
+    cc = os.environ.get("CC" if not cxx else "CXX")
     if cc is None:
         # TODO: support more things here.
-        clang = shutil.which("clang")
-        gcc = shutil.which("gcc")
+        clang = shutil.which("clang" if not cxx else "clang++")
+        gcc = shutil.which("gcc" if not cxx else "g++")
         cc = gcc if gcc is not None else clang
         if cc is None:
-            raise RuntimeError("Failed to find C compiler. Please specify via CC environment variable.")
+            raise RuntimeError("Failed to find C or C++ compiler. Please specify via the CC or CXX environment variable.")
+
     # This function was renamed and made public in Python 3.10
     if hasattr(sysconfig, 'get_default_scheme'):
         scheme = sysconfig.get_default_scheme()
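The _build() change above boils down to a small compiler-selection routine: honor an explicitly set CC (or CXX when building C++), otherwise probe PATH. A minimal standalone sketch of that logic, using the hypothetical name pick_compiler (not part of triton):

import os
import shutil

def pick_compiler(cxx: bool = False) -> str:
    # Prefer an explicitly configured toolchain: CC for C, CXX for C++.
    cc = os.environ.get("CXX" if cxx else "CC")
    if cc is None:
        # Otherwise probe PATH, preferring gcc/g++ over clang/clang++.
        clang = shutil.which("clang++" if cxx else "clang")
        gcc = shutil.which("g++" if cxx else "gcc")
        cc = gcc if gcc is not None else clang
    if cc is None:
        raise RuntimeError("No C/C++ compiler found; set CC or CXX.")
    return cc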
diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py
index 1c1900a0..207ec22c 100644
--- a/python/tutorials/01-vector-add.py
+++ b/python/tutorials/01-vector-add.py
@@ -23,6 +23,39 @@ import torch
import triton
import triton.language as tl
+@triton.jit
+def add_kernel_block_ptr(in1_ptr, in2_ptr, out_ptr, BLOCK_SIZE: tl.constexpr):
+    id = tl.program_id(0)
+    offs = id * BLOCK_SIZE
+    in1_block_ptr = tl.make_block_ptr(
+        base=in1_ptr,
+        shape=(BLOCK_SIZE,),
+        strides=(1,),
+        offsets=(offs,),
+        block_shape=(BLOCK_SIZE,),
+        order=(0,),
+    )
+    in2_block_ptr = tl.make_block_ptr(
+        base=in2_ptr,
+        shape=(BLOCK_SIZE,),
+        strides=(1,),
+        offsets=(offs,),
+        block_shape=(BLOCK_SIZE,),
+        order=(0,),
+    )
+    out_block_ptr = tl.make_block_ptr(
+        base=out_ptr,
+        shape=(BLOCK_SIZE,),
+        strides=(1,),
+        offsets=(offs,),
+        block_shape=(BLOCK_SIZE,),
+        order=(0,),
+    )
+    val1 = tl.load(in1_block_ptr, boundary_check=())
+    val2 = tl.load(in2_block_ptr, boundary_check=())
+    val = val1 + val2
+    tl.store(out_block_ptr, val, boundary_check=())
+
@triton.jit
def add_kernel(x_ptr, # *Pointer* to first input vector.
@@ -42,14 +75,14 @@ def add_kernel(x_ptr, # *Pointer* to first input vector.
     block_start = pid * BLOCK_SIZE
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
     # Create a mask to guard memory operations against out-of-bounds accesses.
-    mask = offsets < n_elements
+    # mask = offsets < n_elements
     # Load x and y from DRAM, masking out any extra elements in case the input is not a
     # multiple of the block size.
-    x = tl.load(x_ptr + offsets, mask=mask)
-    y = tl.load(y_ptr + offsets, mask=mask)
+    x = tl.load(x_ptr + offsets)
+    y = tl.load(y_ptr + offsets)
     output = x + y
     # Write x + y back to DRAM.
-    tl.store(output_ptr + offsets, output, mask=mask)
+    tl.store(output_ptr + offsets, output)
# %%
@@ -57,10 +90,10 @@ def add_kernel(x_ptr, # *Pointer* to first input vector.
# and (2) enqueue the above kernel with appropriate grid/block sizes:
-def add(x: torch.Tensor, y: torch.Tensor):
+def add_triton_cpu(x: torch.Tensor, y: torch.Tensor, use_block_ptr):
     # We need to preallocate the output.
     output = torch.empty_like(x)
-    assert x.is_cuda and y.is_cuda and output.is_cuda
+    # assert not x.is_cuda and not y.is_cuda and not output.is_cuda
     n_elements = output.numel()
     # The SPMD launch grid denotes the number of kernel instances that run in parallel.
     # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int].
@@ -70,7 +103,10 @@ def add(x: torch.Tensor, y: torch.Tensor):
     # - Each torch.tensor object is implicitly converted into a pointer to its first element.
     # - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel.
     # - Don't forget to pass meta-parameters as keywords arguments.
-    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+    if use_block_ptr:
+        add_kernel_block_ptr[grid](x, y, output, BLOCK_SIZE=128)
+    else:
+        add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=128)
     # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
     # running asynchronously at this point.
     return output
@@ -80,12 +116,16 @@ def add(x: torch.Tensor, y: torch.Tensor):
# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
torch.manual_seed(0)
-size = 98432
-x = torch.rand(size, device='cuda')
-y = torch.rand(size, device='cuda')
+size = 128
+x = torch.rand(size, device='cpu')
+y = torch.rand(size, device='cpu')
output_torch = x + y
-output_triton = add(x, y)
print(output_torch)
+output_triton = add_triton_cpu(x, y, True)
+print(output_triton)
+print(f'The maximum difference between torch and triton_block_ptr is '
+      f'{torch.max(torch.abs(output_torch - output_triton))}')
+output_triton = add_triton_cpu(x, y, False)
 print(output_triton)
 print(f'The maximum difference between torch and triton is '
       f'{torch.max(torch.abs(output_torch - output_triton))}')
@@ -130,4 +170,4 @@ def benchmark(size, provider):
# %%
# We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or
# `save_path='/path/to/results/' to save them to disk along with raw CSV data:
-benchmark.run(print_data=True, show_plots=True)
+# benchmark.run(print_data=True, show_plots=True)
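For reference, each tl.make_block_ptr above describes a strided view of the underlying buffer: shape and strides give the layout of the viewed tensor, offsets the block's starting element, block_shape the tile one program instance processes, and order the axis ordering. Note the kernel passes shape=(BLOCK_SIZE,) rather than the full tensor shape; that is only safe here because boundary_check=() disables bounds checking and the test size equals BLOCK_SIZE. A quick check mirroring the modified driver code (a sketch that assumes add_triton_cpu from the patched tutorial is in scope):

import torch

# size == BLOCK_SIZE == 128, so one program instance covers the whole tensor
# and the unmasked loads/stores never run out of bounds.
x = torch.rand(128, device='cpu')
y = torch.rand(128, device='cpu')
assert torch.allclose(add_triton_cpu(x, y, use_block_ptr=True), x + y)   # block-pointer kernel
assert torch.allclose(add_triton_cpu(x, y, use_block_ptr=False), x + y)  # pointer-arithmetic kernel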
diff --git a/third_party/cpu/backend/compiler.py b/third_party/cpu/backend/compiler.py
index 357b5f44..13a2d006 100644
--- a/third_party/cpu/backend/compiler.py
+++ b/third_party/cpu/backend/compiler.py
@@ -137,8 +137,10 @@ class CPUBackend(BaseBackend):
     @staticmethod
     def make_bc(src, metadata, options):
         if os.environ.get("TRITON_CPU_ASM_DUMP", "0") == "1":
-            print("********** Module ASM **********")
-            print(llvm.translate_to_host_asm(src))
+            file_name = f"{metadata['name']}.asm"
+            print(f"********** Writing .asm to {file_name} **********")
+            with open(file_name, "w") as f:
+                f.write(llvm.translate_to_host_asm(src))
         ret = llvm.translate_to_bc(src)
         return ret
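With this change, setting TRITON_CPU_ASM_DUMP=1 writes each kernel's host assembly to "<kernel name>.asm" (taken from metadata['name']) in the current working directory instead of printing it to stdout. A hedged usage sketch; the variable must be set before the kernel is first compiled, since cached compilations skip this path:

import os
os.environ["TRITON_CPU_ASM_DUMP"] = "1"  # set before triton compiles anything
# Running the patched tutorial now leaves e.g. add_kernel.asm on disk.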
diff --git a/third_party/cpu/backend/driver.py b/third_party/cpu/backend/driver.py
index 743684d2..14216725 100644
--- a/third_party/cpu/backend/driver.py
+++ b/third_party/cpu/backend/driver.py
@@ -74,6 +74,7 @@ libraries = [
"LLVMSupport",
"LLVMDemangle",
"stdc++",
+ "z",
]
@@ -86,7 +87,7 @@ def compile_module_from_src(src, name):
             src_path = os.path.join(tmpdir, "main.cpp")
             with open(src_path, "w") as f:
                 f.write(src)
-            so = _build(name, src_path, tmpdir, library_dir, include_dir, libraries)
+            so = _build(name, src_path, tmpdir, library_dir, include_dir, libraries, True)
             with open(so, "rb") as f:
                 cache_path = cache.put(f.read(), f"{name}.so", binary=True)
     import importlib.util
diff --git a/third_party/cpu/include/TritonCPUToLLVM/CMakeLists.txt b/third_party/cpu/include/TritonCPUToLLVM/CMakeLists.txt
index 64b36523..0936dff1 100644
--- a/third_party/cpu/include/TritonCPUToLLVM/CMakeLists.txt
+++ b/third_party/cpu/include/TritonCPUToLLVM/CMakeLists.txt
@@ -1,3 +1,3 @@
set(LLVM_TARGET_DEFINITIONS Passes.td)
mlir_tablegen(Passes.h.inc -gen-pass-decls --name TritonCPUToLLVM)
-add_public_tablegen_target(TritonCPUConversionPassIncGen)
+add_public_tablegen_target(TritonCPUToLLVMConversionPassIncGen)