jwpeterson · October 3, 2018 22:51
diff --git a/key_performance_test.cc b/key_performance_test.cc
 #include "libmesh/mesh_generation.h"
 #include "libmesh/mesh.h"
 #include "libmesh/elem.h"
 #include "libmesh/getpot.h"
 #include "libmesh/string_to_enum.h"
 #include "libmesh/elem_range.h"

 // C++ includes
 #include <chrono>

 using namespace libMesh;

 // This program looks at the performance of the Elem::key() function
 // both before and after the std::array optimizations.
 //
 // Timings before the std::array optimization (Mac Pro 2.7 GHz):
 // 1a.) HEX27, the most expensive element to hash (elapsed time per loop).
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4
 // np.mean([0.238691,0.239391,0.238902,0.238897,0.244535]) = 0.2400832
 //
 // 2a.) TRI3, one of the cheapest elements to hash (total elapsed time).
 // ./key_performance_test-opt -d 2 -n 500 --verbose -e TRI3 --num-loops 100
 // np.mean([0.911238,0.89935,0.90599,0.87904,0.902818,0.907416,0.884893,0.908562,0.927676,0.880153]) = 0.9007136
 //
 // Timings _after_ the std::array optimization (Mac Pro 2.7 GHz):
 // 1b.) HEX27, the most expensive element to hash (elapsed time per loop).
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4
 // np.mean([0.173798,0.170249,0.171532,0.168717,0.170337]) = 0.1709266
 //
 // 2b.) TRI3, one of the cheapest elements to hash (total elapsed time).
 // ./key_performance_test-opt -d 2 -n 500 --verbose -e TRI3 --num-loops 100
 // np.mean([0.875458,0.904809,0.893347,0.90164,0.909899,0.894806,0.878814,0.902609,0.907773,0.911029]) = 0.8980184
 //
 // (single-threaded) Speedup
 // Case 1: 0.2400832 / 0.1709266 = 1.4
 // Case 2: 0.9007136 / 0.8980184 = 1.00
 //
 // --------------------------------------------------------------------------------
 //
 // Multithreaded timings _before_ the std::array optimization (total elapsed time).
 // a. One thread
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=1
 // np.mean([0.954329,0.974633,0.953297,0.983907,0.949742,]) = 0.9631816
 //
 // b.) Two threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=2
 // np.mean([0.541939,0.528525,0.505729,0.532151,0.51816,]) = 0.5253008
 //
 // c.) Three threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=3
 // np.mean([0.394029,0.353447,0.39675,0.397024,0.404526,]) = 0.3891552
 //
 // d.) Four threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=4
 // np.mean([0.36172,0.339432,0.338645,0.355434,0.345501,]) = 0.3481464
 //
 // e.) Five threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=5
 // np.mean([0.223091,0.27234,0.237944,0.282024,0.222382]) = 0.2475562
 //
 // f.) Six threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=6
 // np.mean([0.241564,0.205035,0.240761,0.235877,0.239974]) = 0.2326422
 //
 // g.) Seven threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=7
 // np.mean([0.174728,0.206139,0.215146,0.213032,0.211148]) = 0.2040386
 //
 // h.) Eight threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=8
 // np.mean([0.18606,0.192954,0.184264,0.16539,0.193858]) = 0.1845052
 //
 // --------------------------------------------------------------------------------
 //
 // Multithreaded timings _after_ the std::array optimization (total elapsed time).
 // a. One thread
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=1
 // np.mean([0.674726,0.688726,0.675761,0.680911,0.676321,0.685274,0.681034,0.683756]) = 0.680813625
 //
 // b.) Two threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=2
 // np.mean([0.369587,0.361204,0.387461,0.363354,0.365055,0.362668,0.365292,0.364438]) = 0.367382375
 //
 // c.) Three threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=3
 // np.mean([0.253700,0.254675,0.300733,0.257468,0.301073,0.279767,0.27771,0.256436]) = 0.27269525
 //
 // d.) Four threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=4
 // np.mean([0.242026,0.251493,0.237071,0.211548,0.240836,0.209865,0.25133,0.249079]) = 0.236656
 //
 // e.) Five threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=5
 // np.mean([0.183068,0.196724,0.201339,0.206538,0.192243]) = 0.1959824
 //
 // f.) Six threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=6
 // np.mean([0.183316,0.180756,0.144275,0.180142,0.178662]) = 0.1734302
 //
 // g.) Seven threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=7
 // np.mean([0.157155,0.142657,0.157594,0.13912,0.157656]) = 0.1508364
 //
 // h.) Eight threads
 // ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=8
 // np.mean([0.141037,0.146088,0.132373,0.145368,0.138546]) = 0.1406824








 /**
 * For use when debugging: compare single-threaded algorithm to
 * multi-threaded algorithm running on one thread, make sure they get
 * the same result.
 */
 // #define SINGLE_THREADED 1



 /**
 * For multi-threaded version: a functor that can be called in a parallel_for loop.
 */
 class Body
 {
 public:
  Body () : key_sum() {}
  Body (Body &, Threads::split) : key_sum(0) {}
  ~Body() = default;

  std::size_t key_sum;

  void operator() (const ConstElemRange & range)
  {
    for (const auto & elem : range)
      key_sum += elem->key();
  }

  void join(const Body & other)
  {
    key_sum += other.key_sum;
  }
 };



 /**
 * Main program.
 */
 int main (int argc, char ** argv)
 {
  LibMeshInit init (argc, argv);
  GetPot command_line (argc, argv);

  // Parse command line arguments
  unsigned dim = 2;
  if (command_line.search(2, "-d", "--dim"))
    dim = command_line.next(dim);

  std::string elem_type = "QUAD4";
  if (command_line.search(2, "-e", "--elem_type"))
    elem_type = command_line.next(elem_type);

  ElemType et = Utility::string_to_enum<ElemType>(elem_type);

  // Since building the mesh takes a lot longer than computing keys,
  // we can get longer duration timing by computing the keys multiple
  // times.
  int num_loops = 10;
  if (command_line.search(2, "-L", "--num-loops"))
    num_loops = command_line.next(num_loops);

  // Print thread status message
  libMesh::out << "Running on " << libMesh::n_threads() << " thread(s)." << std::endl;

  Mesh mesh(init.comm(), dim);

  // If true, print more output.
  bool verbose = false;
  if (command_line.search(2, "-v", "--verbose"))
    verbose = true;

  unsigned n_elem = 10;
  if (command_line.search(2, "-n", "--n_elem"))
    n_elem = command_line.next(n_elem);

  // Note: you _must_ have an equal sign, i.e. --n-threads=2, you _cannot_ have a space
  // between the argument and the value. The following code can be used to ensure that
  // it is working...
  // std::vector<std::string> n_threads(2);
  // n_threads[0] = "--n_threads";
  // n_threads[1] = "--n-threads";
  // int nt = libMesh::command_line_value (n_threads, 1);
  // libMesh::out << "Number of threads read from command line " << nt << std::endl;

  if (verbose)
    libMesh::out << "Building " << n_elem << "^" << dim << " mesh." << std::endl;

  if (dim==3)
    {
      // Note: An grid of N^3 HEX8's will have 24*N^3 TET4's.
      // The hashing function for a Tetrahedral face only involves
      // 3 nodes, and hence is different from the hashing function for
      // a Hex, which involves 4.
      MeshTools::Generation::build_cube (mesh,
                                         n_elem, n_elem, n_elem,
                                         -1., 1.,
                                         -1., 1.,
                                         -1., 1.,
                                         et);
    }
  else
    {
      // This will test 2-node hashes
      MeshTools::Generation::build_square (mesh,
                                           n_elem, n_elem,
                                           -1., 1.,
                                           -1., 1.,
                                           et);
    }

  if (verbose)
    libMesh::out << "Constructing range... " << std::endl;


  // Single-threaded version:
  // To minimize the impact of iteration itself in the timing, we'll
  // make a flat vector of Elem* first.
 #ifdef SINGLE_THREADED
  std::vector<Elem *> elem_vec;
  elem_vec.reserve(mesh.n_nodes());
  for (const auto & elem : mesh.element_ptr_range())
    elem_vec.push_back(elem);
 #else
  // Multi-threaded version:
  ConstElemRange elem_range (mesh.elements_begin(), mesh.elements_end());
  Body body;
 #endif

  if (verbose)
    libMesh::out << "Computing keys... " << std::endl;

  // Keep the compiler honest by maintaining a running sum of the key
  // values. This might overflow (?) but who cares.
 #ifdef SINGLE_THREADED
  std::size_t key_sum = 0;
 #endif

  typedef std::chrono::system_clock clock_type;
  std::chrono::time_point<clock_type> start = clock_type::now();

 #ifdef SINGLE_THREADED
  // Single-threaded version
  for (int loop=0; loop<num_loops; ++loop)
    for (const auto & elem : elem_vec)
      key_sum += elem->key();
 #else
  // Multi-threaded (parallel_reduce) version.
  for (int loop=0; loop<num_loops; ++loop)
    Threads::parallel_reduce (elem_range, body);
 #endif

  std::chrono::time_point<clock_type> stop = clock_type::now();
  clock_type::duration dur = stop - start;
  Real elapsed = std::chrono::duration_cast<std::chrono::microseconds>(dur).count() * 1.e-6;

  // Report results
 #ifdef SINGLE_THREADED
  libMesh::out << "key_sum = " << key_sum << std::endl;
 #else
  libMesh::out << "key_sum = " << body.key_sum << std::endl;
 #endif
  libMesh::out << "Total elapsed time: " << elapsed << " s\n";
  libMesh::out << "Elapsed time/loop: " << elapsed / num_loops << " s\n";

  return 0;
 }
	#include "libmesh/mesh_generation.h"
	#include "libmesh/mesh.h"
	#include "libmesh/elem.h"
	#include "libmesh/getpot.h"
	#include "libmesh/string_to_enum.h"
	#include "libmesh/elem_range.h"

	// C++ includes
	#include <chrono>

	using namespace libMesh;

	// This program looks at the performance of the Elem::key() function
	// both before and after the std::array optimizations.
	//
	// Timings before the std::array optimization (Mac Pro 2.7 GHz):
	// 1a.) HEX27, the most expensive element to hash (elapsed time per loop).
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4
	// np.mean([0.238691,0.239391,0.238902,0.238897,0.244535]) = 0.2400832
	//
	// 2a.) TRI3, one of the cheapest elements to hash (total elapsed time).
	// ./key_performance_test-opt -d 2 -n 500 --verbose -e TRI3 --num-loops 100
	// np.mean([0.911238,0.89935,0.90599,0.87904,0.902818,0.907416,0.884893,0.908562,0.927676,0.880153]) = 0.9007136
	//
	// Timings _after_ the std::array optimization (Mac Pro 2.7 GHz):
	// 1b.) HEX27, the most expensive element to hash (elapsed time per loop).
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4
	// np.mean([0.173798,0.170249,0.171532,0.168717,0.170337]) = 0.1709266
	//
	// 2b.) TRI3, one of the cheapest elements to hash (total elapsed time).
	// ./key_performance_test-opt -d 2 -n 500 --verbose -e TRI3 --num-loops 100
	// np.mean([0.875458,0.904809,0.893347,0.90164,0.909899,0.894806,0.878814,0.902609,0.907773,0.911029]) = 0.8980184
	//
	// (single-threaded) Speedup
	// Case 1: 0.2400832 / 0.1709266 = 1.4
	// Case 2: 0.9007136 / 0.8980184 = 1.00
	//
	// --------------------------------------------------------------------------------
	//
	// Multithreaded timings _before_ the std::array optimization (total elapsed time).
	// a. One thread
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=1
	// np.mean([0.954329,0.974633,0.953297,0.983907,0.949742,]) = 0.9631816
	//
	// b.) Two threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=2
	// np.mean([0.541939,0.528525,0.505729,0.532151,0.51816,]) = 0.5253008
	//
	// c.) Three threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=3
	// np.mean([0.394029,0.353447,0.39675,0.397024,0.404526,]) = 0.3891552
	//
	// d.) Four threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=4
	// np.mean([0.36172,0.339432,0.338645,0.355434,0.345501,]) = 0.3481464
	//
	// e.) Five threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=5
	// np.mean([0.223091,0.27234,0.237944,0.282024,0.222382]) = 0.2475562
	//
	// f.) Six threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=6
	// np.mean([0.241564,0.205035,0.240761,0.235877,0.239974]) = 0.2326422
	//
	// g.) Seven threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=7
	// np.mean([0.174728,0.206139,0.215146,0.213032,0.211148]) = 0.2040386
	//
	// h.) Eight threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=8
	// np.mean([0.18606,0.192954,0.184264,0.16539,0.193858]) = 0.1845052
	//
	// --------------------------------------------------------------------------------
	//
	// Multithreaded timings _after_ the std::array optimization (total elapsed time).
	// a. One thread
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=1
	// np.mean([0.674726,0.688726,0.675761,0.680911,0.676321,0.685274,0.681034,0.683756]) = 0.680813625
	//
	// b.) Two threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=2
	// np.mean([0.369587,0.361204,0.387461,0.363354,0.365055,0.362668,0.365292,0.364438]) = 0.367382375
	//
	// c.) Three threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=3
	// np.mean([0.253700,0.254675,0.300733,0.257468,0.301073,0.279767,0.27771,0.256436]) = 0.27269525
	//
	// d.) Four threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=4
	// np.mean([0.242026,0.251493,0.237071,0.211548,0.240836,0.209865,0.25133,0.249079]) = 0.236656
	//
	// e.) Five threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=5
	// np.mean([0.183068,0.196724,0.201339,0.206538,0.192243]) = 0.1959824
	//
	// f.) Six threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=6
	// np.mean([0.183316,0.180756,0.144275,0.180142,0.178662]) = 0.1734302
	//
	// g.) Seven threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=7
	// np.mean([0.157155,0.142657,0.157594,0.13912,0.157656]) = 0.1508364
	//
	// h.) Eight threads
	// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=8
	// np.mean([0.141037,0.146088,0.132373,0.145368,0.138546]) = 0.1406824








	/**
	* For use when debugging: compare single-threaded algorithm to
	* multi-threaded algorithm running on one thread, make sure they get
	* the same result.
	*/
	// #define SINGLE_THREADED 1



	/**
	* For multi-threaded version: a functor that can be called in a parallel_for loop.
	*/
	class Body
	{
	public:
	Body () : key_sum() {}
	Body (Body &, Threads::split) : key_sum(0) {}
	~Body() = default;

	std::size_t key_sum;

	void operator() (const ConstElemRange & range)
	{
	for (const auto & elem : range)
	key_sum += elem->key();
	}

	void join(const Body & other)
	{
	key_sum += other.key_sum;
	}
	};



	/**
	* Main program.
	*/
	int main (int argc, char ** argv)
	{
	LibMeshInit init (argc, argv);
	GetPot command_line (argc, argv);

	// Parse command line arguments
	unsigned dim = 2;
	if (command_line.search(2, "-d", "--dim"))
	dim = command_line.next(dim);

	std::string elem_type = "QUAD4";
	if (command_line.search(2, "-e", "--elem_type"))
	elem_type = command_line.next(elem_type);

	ElemType et = Utility::string_to_enum<ElemType>(elem_type);

	// Since building the mesh takes a lot longer than computing keys,
	// we can get longer duration timing by computing the keys multiple
	// times.
	int num_loops = 10;
	if (command_line.search(2, "-L", "--num-loops"))
	num_loops = command_line.next(num_loops);

	// Print thread status message
	libMesh::out << "Running on " << libMesh::n_threads() << " thread(s)." << std::endl;

	Mesh mesh(init.comm(), dim);

	// If true, print more output.
	bool verbose = false;
	if (command_line.search(2, "-v", "--verbose"))
	verbose = true;

	unsigned n_elem = 10;
	if (command_line.search(2, "-n", "--n_elem"))
	n_elem = command_line.next(n_elem);

	// Note: you _must_ have an equal sign, i.e. --n-threads=2, you _cannot_ have a space
	// between the argument and the value. The following code can be used to ensure that
	// it is working...
	// std::vector<std::string> n_threads(2);
	// n_threads[0] = "--n_threads";
	// n_threads[1] = "--n-threads";
	// int nt = libMesh::command_line_value (n_threads, 1);
	// libMesh::out << "Number of threads read from command line " << nt << std::endl;

	if (verbose)
	libMesh::out << "Building " << n_elem << "^" << dim << " mesh." << std::endl;

	if (dim==3)
	{
	// Note: An grid of N^3 HEX8's will have 24*N^3 TET4's.
	// The hashing function for a Tetrahedral face only involves
	// 3 nodes, and hence is different from the hashing function for
	// a Hex, which involves 4.
	MeshTools::Generation::build_cube (mesh,
	n_elem, n_elem, n_elem,
	-1., 1.,
	-1., 1.,
	-1., 1.,
	et);
	}
	else
	{
	// This will test 2-node hashes
	MeshTools::Generation::build_square (mesh,
	n_elem, n_elem,
	-1., 1.,
	-1., 1.,
	et);
	}

	if (verbose)
	libMesh::out << "Constructing range... " << std::endl;


	// Single-threaded version:
	// To minimize the impact of iteration itself in the timing, we'll
	// make a flat vector of Elem* first.
	#ifdef SINGLE_THREADED
	std::vector<Elem *> elem_vec;
	elem_vec.reserve(mesh.n_nodes());
	for (const auto & elem : mesh.element_ptr_range())
	elem_vec.push_back(elem);
	#else
	// Multi-threaded version:
	ConstElemRange elem_range (mesh.elements_begin(), mesh.elements_end());
	Body body;
	#endif

	if (verbose)
	libMesh::out << "Computing keys... " << std::endl;

	// Keep the compiler honest by maintaining a running sum of the key
	// values. This might overflow (?) but who cares.
	#ifdef SINGLE_THREADED
	std::size_t key_sum = 0;
	#endif

	typedef std::chrono::system_clock clock_type;
	std::chrono::time_point<clock_type> start = clock_type::now();

	#ifdef SINGLE_THREADED
	// Single-threaded version
	for (int loop=0; loop<num_loops; ++loop)
	for (const auto & elem : elem_vec)
	key_sum += elem->key();
	#else
	// Multi-threaded (parallel_reduce) version.
	for (int loop=0; loop<num_loops; ++loop)
	Threads::parallel_reduce (elem_range, body);
	#endif

	std::chrono::time_point<clock_type> stop = clock_type::now();
	clock_type::duration dur = stop - start;
	Real elapsed = std::chrono::duration_cast<std::chrono::microseconds>(dur).count() * 1.e-6;

	// Report results
	#ifdef SINGLE_THREADED
	libMesh::out << "key_sum = " << key_sum << std::endl;
	#else
	libMesh::out << "key_sum = " << body.key_sum << std::endl;
	#endif
	libMesh::out << "Total elapsed time: " << elapsed << " s\n";
	libMesh::out << "Elapsed time/loop: " << elapsed / num_loops << " s\n";

	return 0;
	}