Quick and dirty script to measure how LLaMA.cpp's inference performance has evolved across its commit history.
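The script is meant to be run from the root of a llama.cpp checkout, since it invokes git, make and ./main in the current directory, with the three quantized 7B model files listed in FORMATS below already present under ./models/7B/. Assuming it is saved as bench.mjs (the filename is only illustrative; anything Node treats as an ES module works):

node bench.mjs

Timings and peak memory usage are appended to ./result.csv, one row per benchmarked commit.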
/*!
 * BSD Zero Clause License
 * Copyright (c) Cynthia Rey
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
 */
import { spawn, execSync } from 'child_process'
import { open } from 'fs/promises'
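
// Inference arguments passed to ./main on every run: a fixed prompt, thread
// count, generation length, context size and sampling settings, so timings
// stay comparable from one commit to the next.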
const LLAMA_PARAMETERS = [
  '-t', '4',
  '-p', 'Here is a long story about how programming came to be:',
  '-n', '256',
  '-c', '1024',
  '--top_k', '40',
  '--top_p', '0.95',
  '--repeat_last_n', '64',
  '--repeat_penalty', '1.1',
]
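
// Commit boundaries: benchmarking starts at LLAMA_START_AT, and the on-disk
// model format changed at LLAMA_GGMF_SINCE (ggmf) and LLAMA_GGJT_SINCE (ggjt),
// so each commit must be fed a model file in the format it understands.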
const LLAMA_START_AT = '2d64715ad475f192a4004a52d134c67ccb6f44ad'
const LLAMA_GGMF_SINCE = '074bea2eb1f1349a0118239c4152914aecaa1be4'
const LLAMA_GGJT_SINCE = '78ca9838ee36660a776e97e3391b6fb5dcaacf7f'
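
// Only commits touching these files get benchmarked; other commits are
// unlikely to affect inference performance.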
const IMPORTANT_FILES = [
  'ggml.c', 'ggml.h',
  'llama.cpp', 'llama.h',
  'examples/common.cpp', 'examples/common.h',
  'examples/main/main.cpp',
]
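
// One quantized 7B model per on-disk format; all three files must be present
// before the script is started.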
const FORMATS = {
  unversioned: './models/7B/ggml-model-q4_0.unversioned.bin',
  ggmf: './models/7B/ggml-model-q4_0.ggmf.bin',
  ggjt: './models/7B/ggml-model-q4_0.bin',
}
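
// Results are collected in a CSV file, one row per (commit, format) run.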
const csv = await open('./result.csv', 'w')
await csv.appendFile('commit,format,load time,sample time,prompt eval time,eval time,total time,eval token time,max memory rss (KB)\n')
function getMemoryUsageOf (pid) {
  // `ps -o rss` prints a header line followed by the RSS in KB; take the
  // second whitespace-separated token rather than relying on a fixed byte offset.
  return Number(execSync(`ps -p ${pid} -o rss`).toString().trim().split(/\s+/)[1])
}
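
// Benchmark a single commit: spawn ./main with the right model file, sample
// peak RSS while it runs, enforce a timeout, then parse the timings that
// llama.cpp prints to stderr and append them as a CSV row.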
async function runLlama (commit, format) {
  return new Promise((resolve) => {
    const llama = spawn('./main', [ ...LLAMA_PARAMETERS, '-m', FORMATS[format] ])

    // Track peak memory usage by polling `ps` every 5 seconds.
    // Ugly way of doing it but I cba to make it better :D
    let usedMemory = 0
    const memCheck = setInterval(() => {
      try {
        usedMemory = Math.max(usedMemory, getMemoryUsageOf(llama.pid))
      } catch {
        // The process may have exited between polls; the `exit` handler cleans up.
      }
    }, 5e3)

    // Abort runs taking longer than 5 minutes and mark them as DNF.
    let dnf = false
    const to = setTimeout(() => {
      dnf = true
      llama.kill('SIGKILL')
      console.log('Warn: DNF (timeout)')
    }, 300e3)

    const stderrChunks = []
    llama.stderr.on('data', (d) => stderrChunks.push(d))
    llama.on('exit', () => {
      clearInterval(memCheck)
      clearTimeout(to)

      // DNF; go next
      if (dnf) return resolve()

      // Parse the timings llama.cpp prints to stderr. The per-token eval time
      // regex accounts for both output formats used across the commit range.
      const stderr = Buffer.concat(stderrChunks).toString()
      const timings = stderr.matchAll(/(?:llama_print_timings|main):.*?time =\s+([\d.]+)/g)
      const tokTime = stderr.match(/eval time.*?\(\s+([\d.]+) ms per run|\/\s+([\d.]+) ms/)
      const loadTime = timings.next().value[1]
      const sampleTime = timings.next().value[1]
      // Older commits do not report prompt eval time separately.
      const promptEvalTime = stderr.includes('prompt eval time') ? timings.next().value[1] : ''
      const evalTime = timings.next().value[1]
      const totalTime = timings.next().value[1]
      // Leave the column blank if neither per-token pattern matched.
      const tokenTime = tokTime?.[1] ?? tokTime?.[2] ?? ''

      csv.appendFile(`${commit},${format},${loadTime},${sampleTime},${promptEvalTime},${evalTime},${totalTime},${tokenTime},${usedMemory}\n`)
      resolve()
    })
  })
}
function checkoutLlama (commit) {
  // Clean before checking out: object files left over from a previous commit
  // could otherwise leak into the next build.
  execSync('make clean', { stdio: 'ignore' })
  execSync(`git checkout ${commit}`, { stdio: 'ignore' })
  execSync('make -j main', { stdio: 'ignore' })
}
function getCommits () {
  // This function excludes commits which did not modify llama-related files.
  // It's not made in the most efficient way, but whatever :D
  const allCommits = execSync('git log --pretty=format:"%H"').toString().split('\n')
  const importantCommits = new Set()
  for (const file of IMPORTANT_FILES) {
    execSync(`git log --pretty=format:"%H" --follow '${file}'`).toString().split('\n')
      .forEach((c) => importantCommits.add(c))
  }

  // git log lists newest first; keep everything from HEAD down to
  // LLAMA_START_AT inclusive, then reverse into chronological order.
  return allCommits.slice(0, allCommits.indexOf(LLAMA_START_AT) + 1).reverse()
    .filter((c) => importantCommits.has(c))
}
const commits = getCommits()
const ggmfChangeIdx = commits.indexOf(LLAMA_GGMF_SINCE)
const ggjtChangeIdx = commits.indexOf(LLAMA_GGJT_SINCE)

for (let i = 0; i < commits.length; i++) {
  const commit = commits[i]
  // Pick the model format in use at this point of the history.
  const format = i < ggmfChangeIdx
    ? 'unversioned'
    : i < ggjtChangeIdx
      ? 'ggmf'
      : 'ggjt'

  console.log('Processing commit %s (%d/%d)', commit, i + 1, commits.length)
  try {
    checkoutLlama(commit)
  } catch {
    console.log('Warn: DNF (compile error)')
    continue
  }

  try {
    await runLlama(commit, format)
  } catch {
    console.log('Warn: DNF (runtime error)')
    continue
  }
}

// Close the CSV file handle once all commits have been processed.
await csv.close()