Last active
July 22, 2023 21:40
-
-
Save mayerrobert/913b4c26103c614f9517360a4f00286a to your computer and use it in GitHub Desktop.
Matrix multiplication optimization experiments with SB-SIMD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;;; Matrix multiplication optimization experiments | |
;;; | |
;;; This file contains functions to multiply two matrices written in Common Lisp. | |
;;; | |
;;; DISCLAIMER: while this code has some tests it might contains bugs, | |
;;; I made it to experiment with SBCL's SIMD support. | |
;;; | |
;;; All functions use the naive nested loop algorithm which is O(n^3). | |
;;; This file contains 5 functions with various optimizations | |
;;; that should be portable Common Lisp and 5 functions that use sbcl's SB-SIMD. | |
;;; (the last SIMD function 'matrix-multiply-simd-unrolled-blocked2' is not | |
;;; that useful, though, leaving 4 SIMD functions) | |
;;; | |
;;; All 10 functions should produce the same result, the multiplication of two matrices. | |
;;; | |
;;; Execution times with sbcl-2.3.3 for multiplying | |
;;; two single-float (1000x1000) x (1000x1000) matrices range from: | |
;;; | |
;;; - 1.6 seconds for 'matrix-multiply' through | |
;;; - 0.73 seconds for 'matrix-multiply-unrolled-blocked' (fastest portable version) down to | |
;;; - 0.125 seconds for 'matrix-multiply-simd-unrolled-blocked'. | |
;;; | |
;;; and for multiplying two single-float (5000x2000) x (2000x5000) matrices range from: | |
;;; | |
;;; - 85 seconds for 'matrix-multiply' through | |
;;; - 39 seconds for 'matrix-multiply-unrolled-blocked' (fastest portable version) down to | |
;;; - 6.4 seconds for 'matrix-multiply-simd-unrolled-blocked'. | |
;;; | |
;;; Optimizations include loop unrolling, cache blocking and SIMD instructions. | |
;;; | |
;;; | |
;;; a is an m x p matrix, b is an p x n matrix, result is an m x n matrix. | |
;;; | |
;;; matrix-multiplication ::= for k=1..p, i=1..m, j=1..n | |
;;; result(i,j) := sum(a(i,k) * b(k,j)) | |
;;; | |
;;; | |
;;; Run with | |
;;; sbcl --script mmult-simd.lisp | |
;;; | |
;;; To use all provided functions you need sbcl-x64 2.2.6+ | |
;;; and a CPU that supports the selected instruction set. | |
;;; | |
;;; Other Common Lisp implementations will only be able to use the 5 portable functions. | |
;;; | |
;;; E.g. on abcl run with: | |
;;; C:\> abcl --batch --eval "(compile-file \"mmult-simd.lisp\")" | |
;;; C:\> abcl --batch --load mmult-simd.abcl | |
;;; Using iblock=16, jblock=1536, iblock x jblock x 4 x 3=288k | |
;;; MATRIX-MULTIPLY-UNROLLED-BLOCKED (1000x1000) by (1000x1000) | |
;;; 12.501 seconds real time | |
;;; 4000002 cons cells | |
;; comment out the next line for older sbcl versions w/o SB-SIMD | |
#+(and sbcl :X86-64) (push :use-simd *features*) | |
;; comment out the next line for older CPUs to use SSE | |
(push :avx256 *features*) | |
(declaim (optimize (speed 3) (safety 0) | |
)) | |
#+use-simd | |
(require "sb-simd") | |
(deftype array-index () | |
`(integer 0 (,array-dimension-limit))) | |
(deftype array-element () | |
`(single-float)) | |
;; Use SSE instructions | |
#+(and :use-simd (not :avx256)) | |
(progn | |
(defconstant +simd-size+ (coerce 4 'array-index)) | |
(deftype simd-vector () | |
'(sb-simd-sse:f32.4)) | |
(defmacro simd-vector (&rest args) | |
`(sb-simd-sse:f32.4 ,@args)) | |
(defmacro simd-+ (vec val) | |
`(sb-simd-sse:f32.4+ ,vec ,val)) | |
(defmacro simd-* (vec val) | |
`(sb-simd-sse:f32.4* ,vec ,val)) | |
(defmacro simd-row-major-aref (vec &rest subscripts) | |
`(sb-simd-sse:f32.4-row-major-aref ,vec ,@subscripts)) | |
(format t "Using SSE~%") | |
) | |
;; Use AVX256 | |
#+(and :use-simd :avx256) | |
(progn | |
(defconstant +simd-size+ (coerce 8 'fixnum)) | |
(deftype simd-vector () | |
'(sb-simd-avx:f32.8)) | |
(defmacro simd-vector (&rest args) | |
`(sb-simd-avx:f32.8 ,@args)) | |
(defmacro simd-+ (vec val) | |
`(sb-simd-avx:f32.8+ ,vec ,val)) | |
(defmacro simd-* (vec val) | |
`(sb-simd-avx:f32.8* ,vec ,val)) | |
(defmacro simd-row-major-aref (vec &rest subscripts) | |
`(sb-simd-avx:f32.8-row-major-aref ,vec ,@subscripts)) | |
(format t "Using AVX-256~%") | |
) | |
;; the blocksizes are tailored towards a 64k L1 cache, | |
;; and the blocksizes x 4 should probably be multiples of 64-byte cachelines | |
;; (4 bytes for single floats) | |
(defconstant +iblock-size+ 16) | |
(defconstant +jblock-size+ 256) | |
(defconstant +kblock-size+ 16) ; this is only used for matrix-multiply-simd-unrolled-blocked2 which is slower | |
(format t "Using iblock=~d, jblock=~d, iblock x jblock x 4 x 3=~dk~%" | |
+iblock-size+ +jblock-size+ (truncate (* +iblock-size+ +jblock-size+ 4 3) 1024)) | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;; various implementations of matrix multiplication | |
;; use loop order k/i/j so that the hot loop will read consecutive memory locations | |
(defun matrix-multiply (a b) | |
"Performs matrix multiplication of two arrays." | |
(declare (type (simple-array array-element 2) a b)) | |
(assert (and (= (array-rank a) (array-rank b) 2) | |
(= (array-dimension a 1) (array-dimension b 0))) | |
(a b) | |
"Cannot multiply ~S by ~S." a b) | |
(let* ((m (array-dimension a 0)) | |
(n (array-dimension b 1)) | |
(p (array-dimension a 1)) | |
(result (make-array (list m n) :element-type 'array-element | |
:initial-element 0.0 :adjustable nil :fill-pointer nil))) | |
(declare (array-index m n p)) | |
(dotimes (k p) | |
(declare (array-index k)) | |
(dotimes (i m) | |
(declare (array-index i)) | |
(dotimes (j n) | |
(declare (array-index j)) | |
(incf (aref result i j) (* (aref a i k) (aref b k j)))))) | |
result)) | |
;; simple optimizations: | |
;; lift (aref a i k) out of the hot loop | |
;; calculate addresses by incrementing the indices | |
(defun matrix-multiply-opt (a b) | |
"Performs matrix multiplication of two arrays." | |
(declare (type (simple-array array-element 2) a b)) | |
(assert (and (= (array-rank a) (array-rank b) 2) | |
(= (array-dimension a 1) (array-dimension b 0))) | |
(a b) | |
"Cannot multiply ~S by ~S." a b) | |
(let* ((m (array-dimension a 0)) | |
(n (array-dimension b 1)) | |
(p (array-dimension a 1)) | |
(result (make-array (list m n) :element-type 'array-element | |
:initial-element 0.0 :adjustable nil :fill-pointer nil))) | |
(declare (array-index m n p)) | |
(dotimes (k p) | |
(declare (array-index k)) | |
(dotimes (i m) | |
(declare (array-index i)) | |
(let ((tmp (aref a i k)) | |
(result-idx (array-row-major-index result i 0)) | |
(b-idx (array-row-major-index b k 0))) | |
(declare (array-element tmp) (array-index result-idx b-idx)) | |
;(dotimes (j n) | |
(loop repeat n do | |
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx))) | |
(incf result-idx) | |
(incf b-idx))))) | |
result)) | |
;; similar to matrix-multiply-opt plus unroll the hot loop by 4 | |
(defun matrix-multiply-unrolled (a b) | |
"Performs matrix multiplication of two arrays." | |
(declare (type (simple-array array-element 2) a b)) | |
(assert (and (= (array-rank a) (array-rank b) 2) | |
(= (array-dimension a 1) (array-dimension b 0))) | |
(a b) | |
"Cannot multiply ~S by ~S." a b) | |
(let* ((m (array-dimension a 0)) | |
(n (array-dimension b 1)) | |
(p (array-dimension a 1)) | |
(result (make-array (list m n) :element-type 'array-element | |
:initial-element 0.0 :adjustable nil :fill-pointer nil))) | |
(declare (array-index m n p)) | |
(dotimes (k p) | |
(declare (array-index k)) | |
(dotimes (i m) | |
(declare (array-index i)) | |
(let ((tmp (aref a i k)) | |
(result-idx (array-row-major-index result i 0)) | |
(b-idx (array-row-major-index b k 0))) | |
(declare (array-element tmp) (array-index result-idx b-idx)) | |
(do ((j 0 (+ j 4))) | |
((> j (- n 4)) | |
;(do ((j j (1+ j))) | |
; ((>= j n)) | |
(loop repeat (- n j) do | |
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx))) | |
(incf result-idx) | |
(incf b-idx))) | |
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx))) | |
(incf (row-major-aref result (+ result-idx 1)) (* tmp (row-major-aref b (+ b-idx 1)))) | |
(incf (row-major-aref result (+ result-idx 2)) (* tmp (row-major-aref b (+ b-idx 2)))) | |
(incf (row-major-aref result (+ result-idx 3)) (* tmp (row-major-aref b (+ b-idx 3)))) | |
(incf result-idx 4) | |
(incf b-idx 4))))) | |
result)) | |
;; similar to matrix-multiply-opt but with cache blocking | |
(defun matrix-multiply-blocked (a b) | |
"Performs matrix multiplication of two arrays." | |
(declare (type (simple-array array-element 2) a b)) | |
(assert (and (= (array-rank a) (array-rank b) 2) | |
(= (array-dimension a 1) (array-dimension b 0))) | |
(a b) | |
"Cannot multiply ~S by ~S." a b) | |
(let* ((m (array-dimension a 0)) | |
(n (array-dimension b 1)) | |
(p (array-dimension a 1)) | |
(result (make-array (list m n) :element-type 'array-element | |
:initial-element 0.0 :adjustable nil :fill-pointer nil))) | |
(declare (array-index m n p)) | |
(do ((iblock 0 (+ iblock +iblock-size+))) | |
((>= iblock m)) | |
(let ((ilimit (min m (+ iblock +iblock-size+)))) | |
(do ((jblock 0 (+ jblock +jblock-size+))) | |
((>= jblock n)) | |
(let ((jlimit (min n (+ jblock +jblock-size+)))) | |
(dotimes (k p) | |
(declare (array-index k)) | |
(do ((i iblock (1+ i))) | |
((>= i ilimit)) | |
(declare (array-index i)) | |
(let ((tmp (aref a i k)) | |
(result-idx (array-row-major-index result i 0)) | |
(b-idx (array-row-major-index b k 0))) | |
(declare (array-element tmp) (array-index result-idx b-idx)) | |
;(do ((j jblock (1+ j))) | |
; ((>= j (min n (+ jblock +jblock-size+)))) | |
(loop repeat (- jlimit jblock) do | |
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx))) | |
(incf result-idx) | |
(incf b-idx))))))))) | |
result)) | |
;; similar to matrix-multiply-opt but with loop unrolling and cache blocking | |
(defun matrix-multiply-unrolled-blocked (a b) | |
"Performs matrix multiplication of two arrays." | |
(declare (type (simple-array array-element 2) a b)) | |
(assert (and (= (array-rank a) (array-rank b) 2) | |
(= (array-dimension a 1) (array-dimension b 0))) | |
(a b) | |
"Cannot multiply ~S by ~S." a b) | |
(let* ((m (array-dimension a 0)) | |
(n (array-dimension b 1)) | |
(p (array-dimension a 1)) | |
(result (make-array (list m n) :element-type 'array-element | |
:initial-element 0.0 :adjustable nil :fill-pointer nil))) | |
(declare (array-index m n p)) | |
(do ((iblock 0 (+ iblock +iblock-size+))) | |
((>= iblock m)) | |
(let ((ilimit (min m (+ iblock +iblock-size+)))) | |
(do ((jblock 0 (+ jblock +jblock-size+))) | |
((>= jblock n)) | |
(let ((jlimit (min n (+ jblock +jblock-size+)))) | |
(dotimes (k p) | |
(declare (array-index k)) | |
(do ((i iblock (1+ i))) | |
((>= i ilimit)) | |
(declare (array-index i)) | |
(let ((tmp (aref a i k)) | |
(result-idx (array-row-major-index result i 0)) | |
(b-idx (array-row-major-index b k 0))) | |
(declare (array-element tmp) (array-index result-idx b-idx)) | |
(do ((j jblock (+ j 4))) | |
((> j (- jlimit 4)) | |
;(do ((jj j (1+ jj))) | |
; ((>= jj jlimit)) | |
(loop repeat (- jlimit j) do | |
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx))) | |
(incf result-idx) | |
(incf b-idx))) | |
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx))) | |
(incf (row-major-aref result (+ result-idx 1)) (* tmp (row-major-aref b (+ b-idx 1)))) | |
(incf (row-major-aref result (+ result-idx 2)) (* tmp (row-major-aref b (+ b-idx 2)))) | |
(incf (row-major-aref result (+ result-idx 3)) (* tmp (row-major-aref b (+ b-idx 3)))) | |
(incf result-idx 4) | |
(incf b-idx 4))))))))) | |
result)) | |
;; similar to matrix-multiply-opt except: use SIMD | |
#+use-simd | |
(defun matrix-multiply-simd (a b) | |
"Performs matrix multiplication of two arrays." | |
(declare (type (simple-array array-element 2) a b)) | |
(assert (and (= (array-rank a) (array-rank b) 2) | |
(= (array-dimension a 1) (array-dimension b 0))) | |
(a b) | |
"Cannot multiply ~S by ~S." a b) | |
(let* ((m (array-dimension a 0)) | |
(n (array-dimension b 1)) | |
(p (array-dimension a 1)) | |
(result (make-array (list m n) :element-type 'array-element | |
:initial-element 0.0 :adjustable nil :fill-pointer nil))) | |
(declare (array-index m n p)) | |
(dotimes (k p) | |
(declare (array-index k)) | |
(dotimes (i m) | |
(declare (array-index i)) | |
(let ((vtmp (simd-vector (aref a i k))) | |
(tmp (aref a i k)) | |
(result-idx (array-row-major-index result i 0)) | |
(b-idx (array-row-major-index b k 0))) | |
(declare (simd-vector vtmp) (array-element tmp) (array-index result-idx b-idx)) | |
(do ((j 0 (+ j +simd-size+))) | |
((> j (- n +simd-size+)) | |
;(do ((j j (1+ j))) | |
; ((>= j n)) | |
(loop repeat (- n j) do | |
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx))) | |
(incf result-idx) | |
(incf b-idx))) | |
(setf #1=(simd-row-major-aref result result-idx) | |
(simd-+ #1# (simd-* (simd-row-major-aref b b-idx) vtmp))) | |
(incf result-idx +simd-size+) | |
(incf b-idx +simd-size+))))) | |
result)) | |
;; similar to matrix-multiply-simd but with cache blocking | |
#+use-simd | |
(defun matrix-multiply-simd-blocked (a b) | |
"Performs matrix multiplication of two arrays." | |
(declare (type (simple-array array-element 2) a b)) | |
(assert (and (= (array-rank a) (array-rank b) 2) | |
(= (array-dimension a 1) (array-dimension b 0))) | |
(a b) | |
"Cannot multiply ~S by ~S." a b) | |
(let* ((m (array-dimension a 0)) | |
(n (array-dimension b 1)) | |
(p (array-dimension a 1)) | |
(result (make-array (list m n) :element-type 'array-element | |
:initial-element 0.0 :adjustable nil :fill-pointer nil))) | |
(declare (array-index m n p)) | |
(do ((iblock 0 (+ iblock +iblock-size+))) | |
((>= iblock m)) | |
(let ((ilimit (min m (+ iblock +iblock-size+)))) | |
(do ((jblock 0 (+ jblock +jblock-size+))) | |
((>= jblock n)) | |
(let ((jlimit (min n (+ jblock +jblock-size+)))) | |
(dotimes (k p) | |
(declare (array-index k)) | |
(do ((i iblock (1+ i))) | |
((>= i ilimit)) | |
(declare (array-index i)) | |
(let ((vtmp (simd-vector (aref a i k))) | |
(tmp (aref a i k)) | |
(result-idx (array-row-major-index result i 0)) | |
(b-idx (array-row-major-index b k 0))) | |
(declare (simd-vector vtmp) (array-element tmp) (array-index result-idx b-idx)) | |
(do ((j jblock (+ j +simd-size+))) | |
((> j (- jlimit +simd-size+)) | |
;(do ((j j (1+ j))) | |
; ((>= j jlimit)) | |
(loop repeat (- jlimit j) do | |
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx))) | |
(incf result-idx) | |
(incf b-idx))) | |
(setf #1=(simd-row-major-aref result result-idx) | |
(simd-+ #1# (simd-* (simd-row-major-aref b b-idx) vtmp))) | |
(incf result-idx +simd-size+) | |
(incf b-idx +simd-size+))))))))) | |
result)) | |
;; similar to matrix-multiply-unrolled except: use SIMD | |
#+use-simd | |
(defun matrix-multiply-simd-unrolled (a b) | |
"Performs matrix multiplication of two arrays." | |
(declare (type (simple-array array-element 2) a b)) | |
(assert (and (= (array-rank a) (array-rank b) 2) | |
(= (array-dimension a 1) (array-dimension b 0))) | |
(a b) | |
"Cannot multiply ~S by ~S." a b) | |
(let* ((m (array-dimension a 0)) | |
(n (array-dimension b 1)) | |
(p (array-dimension a 1)) | |
(result (make-array (list m n) :element-type 'array-element | |
:initial-element 0.0 :adjustable nil :fill-pointer nil))) | |
(declare (array-index m n p)) | |
(dotimes (k p) | |
(declare (array-index k)) | |
(dotimes (i m) | |
(declare (array-index i)) | |
(let ((tmp (aref a i k)) | |
(result-idx (array-row-major-index result i 0)) | |
(b-idx (array-row-major-index b k 0))) | |
(declare (array-element tmp) (array-index result-idx b-idx)) | |
(do ((j 0 (+ j (* 4 +simd-size+)))) | |
((> j (- n (* 4 +simd-size+))) | |
(do ((jj j (1+ jj))) | |
((>= jj n)) | |
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx))) | |
(incf result-idx) | |
(incf b-idx))) | |
(setf #1=(simd-row-major-aref result result-idx) | |
(simd-+ #1# (simd-* (simd-row-major-aref b b-idx) tmp))) | |
(setf #2=(simd-row-major-aref result (+ +simd-size+ result-idx)) | |
(simd-+ #2# (simd-* (simd-row-major-aref b (+ +simd-size+ b-idx)) tmp))) | |
(setf #3=(simd-row-major-aref result (+ (* 2 +simd-size+) result-idx)) | |
(simd-+ #3# (simd-* (simd-row-major-aref b (+ (* 2 +simd-size+) b-idx)) tmp))) | |
(setf #4=(simd-row-major-aref result (+ (* 3 +simd-size+) result-idx)) | |
(simd-+ #4# (simd-* (simd-row-major-aref b (+ (* 3 +simd-size+) b-idx)) tmp))) | |
(incf result-idx (* 4 +simd-size+)) | |
(incf b-idx (* 4 +simd-size+)))))) | |
result)) | |
;; similar to matrix-multiply-unrolled-blocked except: use SIMD | |
#+use-simd | |
(defun matrix-multiply-simd-unrolled-blocked (a b) | |
"Performs matrix multiplication of two arrays." | |
(declare (type (simple-array array-element 2) a b)) | |
(assert (and (= (array-rank a) (array-rank b) 2) | |
(= (array-dimension a 1) (array-dimension b 0))) | |
(a b) | |
"Cannot multiply ~S by ~S." a b) | |
(let* ((m (array-dimension a 0)) | |
(n (array-dimension b 1)) | |
(p (array-dimension a 1)) | |
(result (make-array (list m n) :element-type 'array-element | |
:initial-element 0.0 :adjustable nil :fill-pointer nil))) | |
(declare (array-index m n p)) | |
(do ((iblock 0 (+ iblock +iblock-size+))) | |
((>= iblock m)) | |
(let ((ilimit (min m (+ iblock +iblock-size+)))) | |
(do ((jblock 0 (+ jblock +jblock-size+))) | |
((>= jblock n)) | |
(let ((jlimit (min n (+ jblock +jblock-size+)))) | |
(dotimes (k p) | |
(declare (array-index k)) | |
(do ((i iblock (1+ i))) | |
((>= i ilimit)) | |
(declare (array-index i)) | |
(let (;(vtmp (simd-vector (aref a i k))) ;; xx | |
(tmp (aref a i k)) | |
(result-idx (array-row-major-index result i 0)) | |
(b-idx (array-row-major-index b k 0))) | |
(declare ;(simd-vector vtmp) | |
(array-element tmp) (array-index result-idx b-idx jlimit)) | |
(do ((j jblock (+ j (* 4 +simd-size+)))) | |
((> j (- jlimit (* 4 +simd-size+))) | |
(do ((jj j (+ jj +simd-size+))) | |
((> jj (- jlimit +simd-size+)) | |
;(do ((jjj jj (1+ jjj))) | |
; ((>= jjj jlimit)) | |
(loop repeat (- jlimit jj) do | |
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx))) | |
(incf result-idx) | |
(incf b-idx))) | |
(setf #10=(simd-row-major-aref result result-idx) | |
(simd-+ #10# (simd-* (simd-row-major-aref b b-idx) tmp))) | |
(incf result-idx +simd-size+) | |
(incf b-idx +simd-size+))) | |
(setf #1=(simd-row-major-aref result result-idx) | |
(simd-+ #1# (simd-* (simd-row-major-aref b b-idx) tmp))) | |
(setf #2=(simd-row-major-aref result (+ +simd-size+ result-idx)) | |
(simd-+ #2# (simd-* (simd-row-major-aref b (+ +simd-size+ b-idx)) tmp))) | |
(setf #3=(simd-row-major-aref result (+ (* 2 +simd-size+) result-idx)) | |
(simd-+ #3# (simd-* (simd-row-major-aref b (+ (* 2 +simd-size+) b-idx)) tmp))) | |
(setf #4=(simd-row-major-aref result (+ (* 3 +simd-size+) result-idx)) | |
(simd-+ #4# (simd-* (simd-row-major-aref b (+ (* 3 +simd-size+) b-idx)) tmp))) | |
(incf result-idx (* 4 +simd-size+)) | |
(incf b-idx (* 4 +simd-size+)))))))))) | |
result)) | |
;; try a third level of blocking which is not really needed and makes things slower | |
#+use-simd | |
(defun matrix-multiply-simd-unrolled-blocked2 (a b) | |
"Performs matrix multiplication of two arrays." | |
(declare (type (simple-array array-element 2) a b)) | |
(assert (and (= (array-rank a) (array-rank b) 2) | |
(= (array-dimension a 1) (array-dimension b 0))) | |
(a b) | |
"Cannot multiply ~S by ~S." a b) | |
(let* ((m (array-dimension a 0)) | |
(n (array-dimension b 1)) | |
(p (array-dimension a 1)) | |
(result (make-array (list m n) :element-type 'array-element | |
:initial-element 0.0 :adjustable nil :fill-pointer nil))) | |
(declare (array-index m n p)) | |
(do ((kblock 0 (+ kblock +kblock-size+))) | |
((>= kblock p)) | |
(do ((iblock 0 (+ iblock +iblock-size+))) | |
((>= iblock m)) | |
(do ((jblock 0 (+ jblock +jblock-size+))) | |
((>= jblock n)) | |
(do ((k kblock (1+ k))) | |
((>= k (min p (+ kblock +kblock-size+)))) | |
(declare (array-index k)) | |
(do ((i iblock (1+ i))) | |
((>= i (min m (+ iblock +iblock-size+)))) | |
(declare (array-index i)) | |
(let ((tmp (aref a i k)) | |
(result-idx (array-row-major-index result i 0)) | |
(b-idx (array-row-major-index b k 0))) | |
(declare (array-element tmp) (array-index result-idx b-idx)) | |
(do ((j jblock (+ j (* 4 +simd-size+)))) | |
((> j (min (- n (* 4 +simd-size+)) (+ jblock +jblock-size+))) | |
(do ((jj j (1+ jj))) | |
((>= jj (min (- n +simd-size+) (+ jblock +jblock-size+))) | |
(do ((jjj jj (1+ jjj))) | |
((>= jjj (min n (+ jblock +jblock-size+)))) | |
(incf (row-major-aref result result-idx) (* tmp (row-major-aref b b-idx))) | |
(incf result-idx) | |
(incf b-idx))) | |
(setf #10=(simd-row-major-aref result result-idx) | |
(simd-+ #10# (simd-* (simd-row-major-aref b b-idx) tmp))) | |
(incf result-idx +simd-size+) | |
(incf b-idx +simd-size+))) | |
(setf #1=(simd-row-major-aref result result-idx) | |
(simd-+ #1# (simd-* (simd-row-major-aref b b-idx) tmp))) | |
(setf #2=(simd-row-major-aref result (+ +simd-size+ result-idx)) | |
(simd-+ #2# (simd-* (simd-row-major-aref b (+ +simd-size+ b-idx)) tmp))) | |
(setf #3=(simd-row-major-aref result (+ (* 2 +simd-size+) result-idx)) | |
(simd-+ #3# (simd-* (simd-row-major-aref b (+ (* 2 +simd-size+) b-idx)) tmp))) | |
(setf #4=(simd-row-major-aref result (+ (* 3 +simd-size+) result-idx)) | |
(simd-+ #4# (simd-* (simd-row-major-aref b (+ (* 3 +simd-size+) b-idx)) tmp))) | |
(incf result-idx (* 4 +simd-size+)) | |
(incf b-idx (* 4 +simd-size+))))))))) | |
result)) | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;; tests | |
;; comment the next line to run a basic test | |
#+(or) | |
(let ((x (matrix-multiply (make-array '(2 3) :element-type 'array-element | |
:initial-contents '((1.0 2.0 3.0) (4.0 5.0 6.0))) | |
(make-array '(3 2) :element-type 'array-element | |
:initial-contents '((5.0 6.0) (7.0 8.0) (9.0 10.0)))))) | |
;(print x) | |
(unless (equalp x #2A((46.0 52.0) (109.0 124.0))) | |
(error "d'oh"))) | |
(defun random-array (x y) | |
(let ((a (make-array (list x y) :element-type 'array-element :adjustable nil :fill-pointer nil))) | |
(dotimes (i x) | |
(dotimes (j y) | |
(setf (aref a i j) (- 10.0 (random 20.0))))) | |
a)) | |
(defun equal-ish (a b) | |
(let ((x (array-dimension a 0)) | |
(y (array-dimension a 1))) | |
(dotimes (i x) | |
(dotimes (j y) | |
(unless (= (aref a i j) (aref b i j)) | |
(return-from equal-ish nil))))) | |
(return-from equal-ish t)) | |
;; comment the next line to run tests | |
#+(or) | |
(dolist (f '(matrix-multiply | |
matrix-multiply-opt | |
matrix-multiply-unrolled | |
matrix-multiply-blocked | |
matrix-multiply-unrolled-blocked | |
#+use-simd matrix-multiply-simd | |
#+use-simd matrix-multiply-simd-unrolled | |
#+use-simd matrix-multiply-simd-blocked | |
#+use-simd matrix-multiply-simd-unrolled-blocked)) | |
(dolist (x '(15 16 17 #|1535 1536 1537|#)) | |
(dolist (y '(1535 1536 1537)) | |
;(format t "testing ~a (~dx~d) x (~dx~d)~%" f x y y x) | |
#1=(let* ((a (random-array x y)) | |
(b (random-array y x)) | |
(ref (matrix-multiply a b)) | |
(tst (funcall f a b))) | |
(unless (equal-ish ref tst) | |
(format t "ERROR: ~a (~dx~d) x (~dx~d) produces wrong result~%" f x y y x))))) | |
(dotimes (xx 33) | |
(dotimes (yy 65) | |
(let ((x (1+ xx)) | |
(y (1+ yy))) | |
#1#)))) | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;; timed runs | |
(defun arry (dimspec init) | |
"Create a simple-array and set its contents to 'init'." | |
(make-array dimspec | |
:element-type 'array-element | |
:initial-element init | |
:adjustable nil | |
:fill-pointer nil)) | |
(defun mmult (func m p &optional (n m)) | |
(format t "~a (~dx~d) by (~dx~d)~%" func m p p n) | |
(let ((a (arry (list m p) 1.0)) | |
(b (arry (list p n) 2.0))) | |
(time (funcall (symbol-function func) a b)))) | |
(mmult 'matrix-multiply 1000 1000) ; 1.566s | |
;(mmult 'matrix-multiply-opt 1000 1000) ; 0.786s | |
;(mmult 'matrix-multiply-unrolled 1000 1000) ; 0.714s | |
;(mmult 'matrix-multiply-blocked 1000 1000) ; 0.775s | |
;(mmult 'matrix-multiply-unrolled-blocked 1000 1000) ; 0.728s | |
#+use-simd | |
(progn | |
;(mmult 'matrix-multiply-simd 1000 1000) ; 0.172s | |
;(mmult 'matrix-multiply-simd-unrolled 1000 1000) ; 0.156s | |
;(mmult 'matrix-multiply-simd-blocked 1000 1000) ; 0.180s | |
(mmult 'matrix-multiply-simd-unrolled-blocked 1000 1000) ; 0.125s | |
;;;(mmult 'matrix-multiply-simd-unrolled-blocked2 1000 1000) | |
) | |
;(mmult 'matrix-multiply 5000 2000) ; 84.902s | |
;(mmult 'matrix-multiply-opt 5000 2000) ; 46.415s | |
;(mmult 'matrix-multiply-unrolled 5000 2000) ; 40.123s | |
;(mmult 'matrix-multiply-blocked 5000 2000) ; 42.022s | |
;(mmult 'matrix-multiply-unrolled-blocked 5000 2000) ; 38.968s | |
#+use-simd | |
(progn | |
;(mmult 'matrix-multiply-simd 5000 2000) ; 16.019s | |
;(mmult 'matrix-multiply-simd-unrolled 5000 2000) ; 15.519s | |
;(mmult 'matrix-multiply-simd-blocked 5000 2000) ; 9.618s | |
(mmult 'matrix-multiply-simd-unrolled-blocked 5000 2000) ; 6.358s | |
;;;(mmult 'matrix-multiply-simd-unrolled-blocked2 5000 2000) | |
) | |
;(mmult 'matrix-multiply 2000 5000) ; 36.055 | |
;(mmult 'matrix-multiply-opt 2000 5000) ; 17.820s | |
;(mmult 'matrix-multiply-unrolled 2000 5000) ; 16.162s | |
;(mmult 'matrix-multiply-blocked 2000 5000) ; 17.853s | |
;(mmult 'matrix-multiply-unrolled-blocked 2000 5000) ; 15.578s | |
#+use-simd | |
(progn | |
;(mmult 'matrix-multiply-simd 2000 5000) ; 6.409s | |
;(mmult 'matrix-multiply-simd-unrolled 2000 5000) ; 6.523s | |
;(mmult 'matrix-multiply-simd-blocked 2000 5000) ; 3.735s | |
;(mmult 'matrix-multiply-simd-unrolled-blocked 2000 5000) ; 2.966s | |
;;;(mmult 'matrix-multiply-simd-unrolled-blocked2 2000 5000) | |
) | |
;; time should increase with N^3 | |
;(dolist (x '(100 200 500 1000 2000 5000 10000)) (mmult 'matrix-multiply-simd-unrolled-blocked x 1000)) | |
;; time should increase with N | |
;(dolist (x '(100 200 500 1000 2000 5000 10000)) (mmult 'matrix-multiply-simd-unrolled-blocked 1000 x)) | |
;#+(and sbcl :X86-64) | |
;(require :sb-sprof) | |
; | |
;#+(and sbcl :X86-64) | |
;(let ((a (arry '(3000 1000) 1.0)) | |
; (b (arry '(1000 3000) 2.0))) | |
; | |
; (sb-sprof:with-profiling (:mode :cpu :loop nil :max-samples 1000 :report :flat :sample-interval 0.1) | |
; (matrix-multiply-unrolled-blocked a b))) | |
;(disassemble 'matrix-multiply-opt) | |
;(disassemble 'matrix-multiply-unrolled) | |
;(disassemble 'matrix-multiply-blocked) | |
;(disassemble 'matrix-multiply-unrolled-blocked) | |
;(disassemble 'matrix-multiply-simd-unrolled-blocked) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment