Keren Zhou Jokeren

💻

Work

407 followers · 25 following

George Mason University
Fairfax
21:24 (UTC -12:00)
jokeren.tech

View GitHub Profile

Recently created

Least recently created

Recently updated

Least recently updated

Jokeren / wrong.llir

Created August 17, 2024 16:22

AMD vec problem

	; ModuleID = 'LLVMDialectModule'
	source_filename = "LLVMDialectModule"
	target triple = "amdgcn-amd-amdhsa"

	@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16

	; Function Attrs: mustprogress nofree norecurse nounwind willreturn
	define amdgpu_kernel void @flip_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 !dbg !4 {
	%3 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7
	%4 = shl i32 %3, 2, !dbg !8

Jokeren / new.mlir

Created August 9, 2024 02:39

mlirs

	#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
	#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
	#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
	#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}>
	#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1], hasLeadingOffset = false}>
	#shared1 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}>
	module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} {
	tt.func public @hoist_convert_above_extf_and_remat(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.pt

Jokeren / overhead.py

Last active June 7, 2024 15:01

Proton overhead

	import torch
	import time
	import sys


	def run(nelems, iters):
	# Check if CUDA is available
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	tensor_a = torch.randn(nelems, dtype=torch.float32, device=device)

Jokeren / gist:17554b58eed1157a43aaa85a5f1cfc7c

Created June 5, 2024 17:36

fp8_16x16

	"""
	Matrix Multiplication
	=====================
	In this tutorial, you will write a very short high-performance FP16 matrix multiplication kernel that achieves
	performance on par with cuBLAS or rocBLAS.

	You will specifically learn about:

	* Block-level matrix multiplications.

Jokeren / add.py

Last active August 29, 2023 23:40

record function reproducer

	import torch
	import sys

	device = torch.device('cpu')
	left = torch.zeros(100, device=device, requires_grad=True)
	right = torch.zeros(100, device=device, requires_grad=True)
	grad = torch.zeros(100, device=device)

	for _ in range(10):
	output = torch.add(left, right)

Jokeren / test.py

Last active April 3, 2023 23:14

Triton MMA Reduce Reproducer

	import triton
	import pytest
	import torch
	import triton.language as tl
	import numpy as np
	from numpy.random import RandomState


	@pytest.mark.parametrize("M, N, K, num_warps, epilogue, allow_tf32, in_dtype, out_dtype, axis",
	[(*shape_nw, 'softmax', allow_tf32, in_dtype, out_dtype, axis)

Jokeren / main.ptx

Last active April 4, 2023 23:20

PTX undefined behavior

	//
	// Generated by LLVM NVPTX Back-End
	//

	.version 8.0
	.target sm_80
	.address_size 64

	// .globl triton__0d1d2d3d4d56d7d89d1011d1213d1415d1617d1819d2021d2223d2425d2627d2829d3031d3233d3435d3637d3839d4041d42d
	.extern .shared .align 1 .b8 global_smem[];

Jokeren / Instruction.md

Created March 1, 2023 06:13

fp16 mov reproducer

Install

git clone https://github.com/openai/triton.git;
cd triton/python;
pip install cmake; # build time dependency
pip install -e .
pip uninstall pytorch-triton -y

Expected result (-0.1250)

Jokeren / ptx

Created February 28, 2023 03:29

bug.ptx

	//
	// Generated by LLVM NVPTX Back-End
	//
	.version 8.0
	.target sm_80
	.address_size 64
	// .globl triton__0d1d2d3d
	.visible .entry triton__0d1d2d3d(
	.param .u64 triton__0d1d2d3d_param_0,
	.param .u64 triton__0d1d2d3d_param_1,

Jokeren / main.cc

Last active August 16, 2021 04:58

ld_preload + api

	#include <dlfcn.h>
	#include "tool.h"

	// Entry point: looks up the symbol "print" with dlsym(RTLD_NEXT, ...) and
	// calls it through the print_t function-pointer type declared in "tool.h".
	//
	// Returns 0 after a successful call, or 1 when the symbol cannot be resolved.
	// NOTE(review): presumably "print" is supplied by a library injected with
	// LD_PRELOAD (per the gist title) -- confirm against the build/run script.
	int main() {
	    // Explicit loading is intentionally disabled; kept for reference:
	    //void *handle = dlopen("./tool.so", RTLD_NOW);
	    print_t func = (print_t)dlsym(RTLD_NEXT, "print");
	    // dlsym returns NULL when no later object in the search order defines
	    // the symbol; calling through a null function pointer would crash.
	    if (!func) {
	        return 1;
	    }
	    func();
	    return 0;
	}

Newer Older