Last active
October 3, 2018 22:51
-
-
Save jwpeterson/5b4f0fd0d92939c618ea3a976b3808cc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "libmesh/mesh_generation.h" | |
#include "libmesh/mesh.h" | |
#include "libmesh/elem.h" | |
#include "libmesh/getpot.h" | |
#include "libmesh/string_to_enum.h" | |
#include "libmesh/elem_range.h" | |
// C++ includes | |
#include <chrono> | |
using namespace libMesh; | |
// This program looks at the performance of the Elem::key() function | |
// both before and after the std::array optimizations. | |
// | |
// Timings before the std::array optimization (Mac Pro 2.7 GHz): | |
// 1a.) HEX27, the most expensive element to hash (elapsed time per loop). | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 | |
// np.mean([0.238691,0.239391,0.238902,0.238897,0.244535]) = 0.2400832 | |
// | |
// 2a.) TRI3, one of the cheapest elements to hash (total elapsed time). | |
// ./key_performance_test-opt -d 2 -n 500 --verbose -e TRI3 --num-loops 100 | |
// np.mean([0.911238,0.89935,0.90599,0.87904,0.902818,0.907416,0.884893,0.908562,0.927676,0.880153]) = 0.9007136 | |
// | |
// Timings _after_ the std::array optimization (Mac Pro 2.7 GHz): | |
// 1b.) HEX27, the most expensive element to hash (elapsed time per loop). | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 | |
// np.mean([0.173798,0.170249,0.171532,0.168717,0.170337]) = 0.1709266 | |
// | |
// 2b.) TRI3, one of the cheapest elements to hash (total elapsed time). | |
// ./key_performance_test-opt -d 2 -n 500 --verbose -e TRI3 --num-loops 100 | |
// np.mean([0.875458,0.904809,0.893347,0.90164,0.909899,0.894806,0.878814,0.902609,0.907773,0.911029]) = 0.8980184 | |
// | |
// (single-threaded) Speedup | |
// Case 1: 0.2400832 / 0.1709266 = 1.4 | |
// Case 2: 0.9007136 / 0.8980184 = 1.00 | |
// | |
// -------------------------------------------------------------------------------- | |
// | |
// Multithreaded timings _before_ the std::array optimization (total elapsed time). | |
// a. One thread | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=1 | |
// np.mean([0.954329,0.974633,0.953297,0.983907,0.949742,]) = 0.9631816 | |
// | |
// b.) Two threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=2 | |
// np.mean([0.541939,0.528525,0.505729,0.532151,0.51816,]) = 0.5253008 | |
// | |
// c.) Three threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=3 | |
// np.mean([0.394029,0.353447,0.39675,0.397024,0.404526,]) = 0.3891552 | |
// | |
// d.) Four threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=4 | |
// np.mean([0.36172,0.339432,0.338645,0.355434,0.345501,]) = 0.3481464 | |
// | |
// e.) Five threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=5 | |
// np.mean([0.223091,0.27234,0.237944,0.282024,0.222382]) = 0.2475562 | |
// | |
// f.) Six threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=6 | |
// np.mean([0.241564,0.205035,0.240761,0.235877,0.239974]) = 0.2326422 | |
// | |
// g.) Seven threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=7 | |
// np.mean([0.174728,0.206139,0.215146,0.213032,0.211148]) = 0.2040386 | |
// | |
// h.) Eight threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=8 | |
// np.mean([0.18606,0.192954,0.184264,0.16539,0.193858]) = 0.1845052 | |
// | |
// -------------------------------------------------------------------------------- | |
// | |
// Multithreaded timings _after_ the std::array optimization (total elapsed time). | |
// a. One thread | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=1 | |
// np.mean([0.674726,0.688726,0.675761,0.680911,0.676321,0.685274,0.681034,0.683756]) = 0.680813625 | |
// | |
// b.) Two threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=2 | |
// np.mean([0.369587,0.361204,0.387461,0.363354,0.365055,0.362668,0.365292,0.364438]) = 0.367382375 | |
// | |
// c.) Three threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=3 | |
// np.mean([0.253700,0.254675,0.300733,0.257468,0.301073,0.279767,0.27771,0.256436]) = 0.27269525 | |
// | |
// d.) Four threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=4 | |
// np.mean([0.242026,0.251493,0.237071,0.211548,0.240836,0.209865,0.25133,0.249079]) = 0.236656 | |
// | |
// e.) Five threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=5 | |
// np.mean([0.183068,0.196724,0.201339,0.206538,0.192243]) = 0.1959824 | |
// | |
// f.) Six threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=6 | |
// np.mean([0.183316,0.180756,0.144275,0.180142,0.178662]) = 0.1734302 | |
// | |
// g.) Seven threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=7 | |
// np.mean([0.157155,0.142657,0.157594,0.13912,0.157656]) = 0.1508364 | |
// | |
// h.) Eight threads | |
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=8 | |
// np.mean([0.141037,0.146088,0.132373,0.145368,0.138546]) = 0.1406824 | |
/** | |
* For use when debugging: compare single-threaded algorithm to | |
* multi-threaded algorithm running on one thread, make sure they get | |
* the same result. | |
*/ | |
// #define SINGLE_THREADED 1 | |
/** | |
* For multi-threaded version: a functor that can be called in a parallel_for loop. | |
*/ | |
class Body | |
{ | |
public: | |
Body () : key_sum() {} | |
Body (Body &, Threads::split) : key_sum(0) {} | |
~Body() = default; | |
std::size_t key_sum; | |
void operator() (const ConstElemRange & range) | |
{ | |
for (const auto & elem : range) | |
key_sum += elem->key(); | |
} | |
void join(const Body & other) | |
{ | |
key_sum += other.key_sum; | |
} | |
}; | |
/** | |
* Main program. | |
*/ | |
int main (int argc, char ** argv) | |
{ | |
LibMeshInit init (argc, argv); | |
GetPot command_line (argc, argv); | |
// Parse command line arguments | |
unsigned dim = 2; | |
if (command_line.search(2, "-d", "--dim")) | |
dim = command_line.next(dim); | |
std::string elem_type = "QUAD4"; | |
if (command_line.search(2, "-e", "--elem_type")) | |
elem_type = command_line.next(elem_type); | |
ElemType et = Utility::string_to_enum<ElemType>(elem_type); | |
// Since building the mesh takes a lot longer than computing keys, | |
// we can get longer duration timing by computing the keys multiple | |
// times. | |
int num_loops = 10; | |
if (command_line.search(2, "-L", "--num-loops")) | |
num_loops = command_line.next(num_loops); | |
// Print thread status message | |
libMesh::out << "Running on " << libMesh::n_threads() << " thread(s)." << std::endl; | |
Mesh mesh(init.comm(), dim); | |
// If true, print more output. | |
bool verbose = false; | |
if (command_line.search(2, "-v", "--verbose")) | |
verbose = true; | |
unsigned n_elem = 10; | |
if (command_line.search(2, "-n", "--n_elem")) | |
n_elem = command_line.next(n_elem); | |
// Note: you _must_ have an equal sign, i.e. --n-threads=2, you _cannot_ have a space | |
// between the argument and the value. The following code can be used to ensure that | |
// it is working... | |
// std::vector<std::string> n_threads(2); | |
// n_threads[0] = "--n_threads"; | |
// n_threads[1] = "--n-threads"; | |
// int nt = libMesh::command_line_value (n_threads, 1); | |
// libMesh::out << "Number of threads read from command line " << nt << std::endl; | |
if (verbose) | |
libMesh::out << "Building " << n_elem << "^" << dim << " mesh." << std::endl; | |
if (dim==3) | |
{ | |
// Note: An grid of N^3 HEX8's will have 24*N^3 TET4's. | |
// The hashing function for a Tetrahedral face only involves | |
// 3 nodes, and hence is different from the hashing function for | |
// a Hex, which involves 4. | |
MeshTools::Generation::build_cube (mesh, | |
n_elem, n_elem, n_elem, | |
-1., 1., | |
-1., 1., | |
-1., 1., | |
et); | |
} | |
else | |
{ | |
// This will test 2-node hashes | |
MeshTools::Generation::build_square (mesh, | |
n_elem, n_elem, | |
-1., 1., | |
-1., 1., | |
et); | |
} | |
if (verbose) | |
libMesh::out << "Constructing range... " << std::endl; | |
// Single-threaded version: | |
// To minimize the impact of iteration itself in the timing, we'll | |
// make a flat vector of Elem* first. | |
#ifdef SINGLE_THREADED | |
std::vector<Elem *> elem_vec; | |
elem_vec.reserve(mesh.n_nodes()); | |
for (const auto & elem : mesh.element_ptr_range()) | |
elem_vec.push_back(elem); | |
#else | |
// Multi-threaded version: | |
ConstElemRange elem_range (mesh.elements_begin(), mesh.elements_end()); | |
Body body; | |
#endif | |
if (verbose) | |
libMesh::out << "Computing keys... " << std::endl; | |
// Keep the compiler honest by maintaining a running sum of the key | |
// values. This might overflow (?) but who cares. | |
#ifdef SINGLE_THREADED | |
std::size_t key_sum = 0; | |
#endif | |
typedef std::chrono::system_clock clock_type; | |
std::chrono::time_point<clock_type> start = clock_type::now(); | |
#ifdef SINGLE_THREADED | |
// Single-threaded version | |
for (int loop=0; loop<num_loops; ++loop) | |
for (const auto & elem : elem_vec) | |
key_sum += elem->key(); | |
#else | |
// Multi-threaded (parallel_reduce) version. | |
for (int loop=0; loop<num_loops; ++loop) | |
Threads::parallel_reduce (elem_range, body); | |
#endif | |
std::chrono::time_point<clock_type> stop = clock_type::now(); | |
clock_type::duration dur = stop - start; | |
Real elapsed = std::chrono::duration_cast<std::chrono::microseconds>(dur).count() * 1.e-6; | |
// Report results | |
#ifdef SINGLE_THREADED | |
libMesh::out << "key_sum = " << key_sum << std::endl; | |
#else | |
libMesh::out << "key_sum = " << body.key_sum << std::endl; | |
#endif | |
libMesh::out << "Total elapsed time: " << elapsed << " s\n"; | |
libMesh::out << "Elapsed time/loop: " << elapsed / num_loops << " s\n"; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment