// bslmt_throughputbenchmark.h                                        -*-C++-*-

#ifndef INCLUDED_BSLMT_THROUGHPUTBENCHMARK
#define INCLUDED_BSLMT_THROUGHPUTBENCHMARK

#include <bsls_ident.h>
BSLS_IDENT("$Id: $")

//@PURPOSE: Provide a performance test harness for multi-threaded components.
//
//@CLASSES:
//  bslmt::ThroughputBenchmark: multi-threaded performance test harness
//
//@DESCRIPTION: This component defines a mechanism,
// 'bslmt::ThroughputBenchmark', that provides performance testing for multi-
// threaded components.  The results are loaded into a
// 'bslmt::ThroughputBenchmarkResult' object, which provides access to counts
// of the work done by each thread, thread group, and sample, divided by the
// number of actual seconds of execution.
//
///Structure of a Test
///-------------------
// A test is composed from one or more thread groups, each running one or more
// threads.  Each thread in a thread group executes a thread function, with a
// simulated work load executing between subsequent calls to the thread
// function.  To provide reliability, the test is executed multiple times.  A
// single execution of a test is referred to as a "sample execution" and its
// result referred to as a "sample".  To support fine tuning of the test, it is
// possible to provide initialize and cleanup functions for a sample and / or a
// thread.
//
///Usage
///-----
// This section illustrates intended use of this component.
//
///Example 1: Test Performance of bsl::queue<int>
/// - - - - - - - - - - - - - - - - - - - - - - -
// In the following example we test the throughput of a 'bsl::queue<int>' in a
// multi-threaded environment, where multiple "producer" threads are pushing
// elements, and multiple "consumer" threads are popping these elements.
//
// First, we define a global queue, a mutex to protect this queue, and a
// semaphore for a "pop" operation to block on:
//..
//  bsl::queue<int>  myQueue;
//  bslmt::Mutex     myMutex;
//  bslmt::Semaphore mySem;
//..
// Next, we define a counter value we push in:
//..
//  int              counterValue = 0;
//..
// Then, we define simple push and pop functions that manipulate this queue:
//..
//  void myPush(int threadIndex)
//      // Push an element into 'myQueue', using the specified 'threadIndex'.
//  {
//      bslmt::LockGuard<bslmt::Mutex> guard(&myMutex);
//      myQueue.push(1000000 * threadIndex + counterValue++);
//      mySem.post();
//  }
//
//  void myPop(int)
//      // Pop an element from 'myQueue'.
//  {
//      mySem.wait();
//      bslmt::LockGuard<bslmt::Mutex> guard(&myMutex);
//      myQueue.pop();
//  }
//..
// Next, we define a thread "cleanup" function for the push thread group, which
// pushes a couple of extra elements to make sure that the pop thread group
// will not hang on an empty queue:
//..
//  void myCleanup()
//      // Cleanup function.
//  {
//      bslmt::LockGuard<bslmt::Mutex> guard(&myMutex);
//      for (int i = 0; i < 10; ++i) {
//          myQueue.push(counterValue++);
//          mySem.post();
//      }
//  }
//..
// Then, we create a 'bslmt::ThroughputBenchmark' object and add push and pop
// thread groups, each with 2 threads and a work load (arithmetic operations to
// consume an amount of time) of 100:
//..
//  bslmt::ThroughputBenchmark myBench;
//  myBench.addThreadGroup(
//                      myPush,
//                      2,
//                      100,
//                      bslmt::ThroughputBenchmark::InitializeThreadFunction(),
//                      myCleanup);
//  const int consumerGroupIdx = myBench.addThreadGroup(myPop, 2, 100);
//..
// Now, we create a 'bslmt::ThroughputBenchmarkResult' object to contain the
// result, and call 'execute' to run the benchmark for 500 milliseconds 10
// times:
//..
//  bslmt::ThroughputBenchmarkResult myResult;
//  myBench.execute(&myResult, 500, 10);
//..
// Finally, we print the median of the throughput of the consumer thread group:
//..
//  double median;
//  myResult.getMedian(&median, consumerGroupIdx);
//  bsl::cout << "Throughput:" << median << "\n";
//..

#include <bslscm_version.h>

#include <bslmt_barrier.h>
#include <bslmt_throughputbenchmarkresult.h>

#include <bslma_allocator.h>

#include <bslmf_nestedtraitdeclaration.h>

#include <bsls_assert.h>
#include <bsls_atomic.h>
#include <bsls_timeinterval.h>
#include <bsls_types.h>

#include <bsl_functional.h>
#include <bsl_vector.h>

namespace BloombergLP {
namespace bslmt {

class ThroughputBenchmark_TestUtil;

                        // =========================
                        // class ThroughputBenchmark
                        // =========================

class ThroughputBenchmark {
    // This class is a mechanism that provides performance testing for multi-
    // threaded components.  It allows running different thread functions at
    // the same time, and simulates a work load between subsequent calls to the
    // tested thread functions.  The results are loaded into a
    // 'bslmt::ThroughputBenchmarkResult' object, which provides access to
    // counts of the work done by each thread, thread group, and sample,
    // divided by the number of actual seconds of execution.

  public:
    // PUBLIC TYPES
    typedef bsl::function<void(int)> RunFunction;
        // An alias to a function meeting the following contract:
        //..
        //  void runTest(int threadIndex);
        //      // Run the main part of the benchmark having the specified
        //      // 'threadIndex'.  The behavior is undefined unless
        //      // 'threadIndex' is in the range '[0, numThreadsInGroup)',
        //      // where 'numThreadsInGroup' is the number of threads in a
        //      // thread group for the associated throughput benchmark.
        //..

    typedef bsl::function<void(bool)> InitializeSampleFunction;
        // An alias to a function meeting the following contract:
        //..
        //  void initializeSample(bool isFirst);
        //      // Initialize the sample run.  If the specified 'isFirst' is
        //      // 'true', this is the first sample run.
        //..

    typedef bsl::function<void(bool)> ShutdownSampleFunction;
        // An alias to a function meeting the following contract:
        //..
        //  void shutdownSample(bool isLast);
        //      // Clean up at the end of the sample run, before threads have
        //      // been joined.  If the specified 'isLast' is 'true', this is
        //      // the last sample run.
        //..

    typedef bsl::function<void(bool)> CleanupSampleFunction;
        // An alias to a function meeting the following contract:
        //..
        //  void cleanupSample(bool isLast);
        //      // Clean up after the sample run.  If the specified 'isLast' is
        //      // 'true', this is the last sample run.
        //..

    typedef bsl::function<void()> InitializeThreadFunction;
        // An alias to a function meeting the following contract:
        //..
        //  void initializeThread();
        //      // Initialize each thread in a sample run.
        //..

    typedef bsl::function<void()> CleanupThreadFunction;
        // An alias to a function meeting the following contract:
        //..
        //  void cleanupThread();
        //      // Clean up after each thread in a sample run.
        //..

    struct ThreadGroup {
        // Data describing one thread group: the function to run, the number
        // of threads, the busy-work amount, and the per-thread initialize and
        // cleanup functions.

        // PUBLIC DATA
        RunFunction               d_func;         // test function to run

        int                       d_numThreads;   // number of threads in the
                                                  // thread group

        bsls::Types::Int64        d_amount;       // amount of busy work to
                                                  // perform between calls to
                                                  // 'd_func'

        InitializeThreadFunction  d_initialize;   // initialize function per
                                                  // thread

        CleanupThreadFunction     d_cleanup;      // cleanup function per
                                                  // thread
    };

  private:
    // CLASS DATA
    static unsigned int       s_antiOptimization; // Used by 'busyWork' to
                                                  // prevent optimization.

    // DATA
    bsl::vector<ThreadGroup>  d_threadGroups;     // Data kept for each thread
                                                  // group added.

    bsls::AtomicInt           d_state;            // Tells a test thread
                                                  // whether to keep running:
                                                  // it starts as 0, and the
                                                  // thread exits once it is
                                                  // set to 1.

    // FRIENDS
    friend class ThroughputBenchmark_WorkFunction;
    friend class ThroughputBenchmark_TestUtil;

    // NOT IMPLEMENTED
    ThroughputBenchmark(const ThroughputBenchmark&);
    ThroughputBenchmark& operator=(const ThroughputBenchmark&);

    // PRIVATE ACCESSORS
    bool isRunState() const;
        // Return 'true' if the test should continue to run, and 'false'
        // otherwise.

  public:
    // TRAITS
    BSLMF_NESTED_TRAIT_DECLARATION(ThroughputBenchmark,
                                   bslma::UsesBslmaAllocator);

    // CLASS METHODS
    static unsigned int antiOptimization();
        // Return the value calculated by 'busyWork'.  Note that this method is
        // provided to prevent the compiler from optimizing the simulated
        // workload away.

    static void busyWork(bsls::Types::Int64 busyWorkAmount);
        // Perform arithmetic operations to consume an amount of time in linear
        // relation to the specified 'busyWorkAmount'.  Note that the duration
        // of 'busyWork' invoked with a particular 'busyWorkAmount' will vary
        // with system load.

    static bsls::Types::Int64 estimateBusyWorkAmount(
                                                  bsls::TimeInterval duration);
        // Return an estimate of the work amount so that 'busyWork' invoked
        // with the returned work amount executes, approximately, for the
        // specified 'duration'.  Note that this estimate varies with system
        // load.

    // CREATORS
    explicit ThroughputBenchmark(bslma::Allocator *basicAllocator = 0);
        // Create an empty 'ThroughputBenchmark' object.  Optionally specify a
        // 'basicAllocator' used to supply memory.  If 'basicAllocator' is 0,
        // the currently installed default allocator is used.

    // MANIPULATORS
    int addThreadGroup(const RunFunction& runFunction,
                       int                numThreads,
                       bsls::Types::Int64 busyWorkAmount);
    int addThreadGroup(const RunFunction&              runFunction,
                       int                             numThreads,
                       bsls::Types::Int64              busyWorkAmount,
                       const InitializeThreadFunction& initializeFunctor,
                       const CleanupThreadFunction&    cleanupFunctor);
        // Create a set of threads, with cardinality the specified
        // 'numThreads', that will repeatedly execute the specified
        // 'runFunction' followed by 'busyWork' invoked with the specified
        // 'busyWorkAmount' as its argument.  Optionally specify
        // 'initializeFunctor', which takes no arguments and is run at the
        // beginning of each thread of the sample.  Optionally specify
        // 'cleanupFunctor', which takes no arguments and is run at the end of
        // each thread of the sample.  Return the index of the added thread
        // group.  The behavior is undefined unless '0 < numThreads' and
        // '0 <= busyWorkAmount'.

    void execute(ThroughputBenchmarkResult       *result,
                 int                              millisecondsPerSample,
                 int                              numSamples);
    void execute(ThroughputBenchmarkResult       *result,
                 int                              millisecondsPerSample,
                 int                              numSamples,
                 const InitializeSampleFunction&  initializeFunctor,
                 const ShutdownSampleFunction&    shutdownFunctor,
                 const CleanupSampleFunction&     cleanupFunctor);
        // Run the tests previously added with calls to the 'addThreadGroup'
        // method.  The tests are run the specified 'numSamples' times.  Each
        // sample is run for the specified 'millisecondsPerSample' duration.
        // The results are stored in the specified 'result' object.
        // Optionally specify 'initializeFunctor', which is run at the
        // beginning of the sample and accepts a boolean flag 'isFirst', that
        // is set to 'true' on the first sample, and 'false' otherwise.
        // Optionally specify 'shutdownFunctor', which is run at the end of
        // each sample before threads have been joined, and accepts a boolean
        // flag 'isLast', that is set to 'true' on the last sample, and 'false'
        // otherwise.  Optionally specify 'cleanupFunctor', which is run at the
        // end of each sample after threads have been joined, and accepts a
        // boolean flag 'isLast', that is set to 'true' on the last sample, and
        // 'false' otherwise.  The behavior is undefined unless
        // '0 < millisecondsPerSample', '0 < numSamples', and
        // '0 < numThreadGroups()'.  Also see {Structure of a Test}.

    // ACCESSORS
    int numThreads() const;
        // Return the total number of threads, summed over all thread groups.

    int numThreadGroups() const;
        // Return the number of thread groups.

    int numThreadsInGroup(int threadGroupIndex) const;
        // Return the number of threads in the thread group having the
        // specified 'threadGroupIndex'.  The behavior is undefined unless
        // '0 <= threadGroupIndex < numThreadGroups()'.

                                  // Aspects

    bslma::Allocator *allocator() const;
        // Return the allocator used by this object.

};

                   // ===================================
                   // struct ThroughputBenchmark_WorkData
                   // ===================================

struct ThroughputBenchmark_WorkData {
    // This 'struct' holds the data transferred to a
    // 'ThroughputBenchmark_WorkFunction': the functions and busy-work amount
    // of the thread's group, the owning benchmark, the index of the thread,
    // the start barrier, and the per-thread measurements (elapsed nanoseconds
    // and processed-item count).

    // PUBLIC DATA
    ThroughputBenchmark::RunFunction              d_func;
                                                    // test function to run

    bsls::Types::Int64                            d_amount;
                                                    // busy work amount

    ThroughputBenchmark::InitializeThreadFunction d_initialize;
                                                    // initialize function per
                                                    // thread

    ThroughputBenchmark::CleanupThreadFunction    d_cleanup;
                                                    // cleanup function per
                                                    // thread

    ThroughputBenchmark                          *d_bench_p;
                                                    // exposes the "this"
                                                    // pointer of the benchmark
                                                    // to the work thread

    int                                           d_threadIndex;
                                                    // thread index 0, 1, 2,
                                                    // ... that is provided to
                                                    // the thread to
                                                    // differentiate it if so
                                                    // desired

    bslmt::Barrier                               *d_barrier_p;
                                                    // barrier that triggers
                                                    // the threads to start
                                                    // processing at the same
                                                    // time

    bsls::Types::Int64                            d_actualNanos;
                                                    // number of nanoseconds
                                                    // that the thread actually
                                                    // ran

    bsls::Types::Int64                            d_count;
                                                    // number of items
                                                    // processed by this thread
};

                  // ======================================
                  // class ThroughputBenchmark_WorkFunction
                  // ======================================

class ThroughputBenchmark_WorkFunction {
    // This class is the work function functor that is invoked on each work
    // thread.  It refers to, but does not own, the
    // 'ThroughputBenchmark_WorkData' supplied at construction.

  private:
    // DATA
    ThroughputBenchmark_WorkData& d_data;  // work data for the thread (held
                                           // by reference, so the referenced
                                           // object must outlive this
                                           // functor)

  public:
    // CREATORS
    explicit ThroughputBenchmark_WorkFunction(
                                           ThroughputBenchmark_WorkData& data);
        // Create a 'ThroughputBenchmark_WorkFunction' object that refers to
        // the specified 'data'.

    //! ~ThroughputBenchmark_WorkFunction() = default;
        // Destroy this object.

    // MANIPULATORS
    void operator()();
        // Work function being run on the thread.
};

                    // ==================================
                    // class ThroughputBenchmark_TestUtil
                    // ==================================

class ThroughputBenchmark_TestUtil {
    // This class implements a test utility that gives the test driver access
    // to the unexposed data members of 'ThroughputBenchmark'.  It refers to,
    // but does not own, the benchmark supplied at construction.

    // DATA
    ThroughputBenchmark& d_data;  // benchmark under test (held by reference,
                                  // so the referenced object must outlive
                                  // this utility)

  public:
    // CREATORS
    explicit ThroughputBenchmark_TestUtil(ThroughputBenchmark& data);
        // Create a 'ThroughputBenchmark_TestUtil' object to test contents of
        // the specified 'data'.

    //! ~ThroughputBenchmark_TestUtil() = default;
        // Destroy this object.

    // MANIPULATORS
    bsls::AtomicInt& state();
        // Return a reference providing modifiable access to the 'd_state' data
        // member of 'ThroughputBenchmark'.

    bsl::vector<ThroughputBenchmark::ThreadGroup>& threadGroups();
        // Return a reference providing modifiable access to the
        // 'd_threadGroups' data member of 'ThroughputBenchmark'.
};

// ============================================================================
//                             INLINE DEFINITIONS
// ============================================================================

                        // -------------------------
                        // class ThroughputBenchmark
                        // -------------------------

// PRIVATE ACCESSORS
inline
bool ThroughputBenchmark::isRunState() const
{
    // A state of 0 means the test keeps running; any other value means the
    // test threads have been told to stop.
    const int currentState = d_state.loadAcquire();
    return 0 == currentState;
}

// ACCESSORS
inline
int ThroughputBenchmark::numThreads() const
{
    if (0 == d_threadGroups.size()) {
        return 0;                                                     // RETURN
    }

    int numThreads = d_threadGroups[0].d_numThreads;
    for (int i = 1; i < numThreadGroups(); ++i) {
        numThreads += d_threadGroups[i].d_numThreads;
    }
    return numThreads;
}

inline
int ThroughputBenchmark::numThreadGroups() const
{
    return static_cast<int>(d_threadGroups.size());
}

inline
int ThroughputBenchmark::numThreadsInGroup(int threadGroupIndex) const
{
    BSLS_ASSERT(0                 <= threadGroupIndex);
    BSLS_ASSERT(numThreadGroups() >  threadGroupIndex);

    return d_threadGroups[threadGroupIndex].d_numThreads;
}

                                  // Aspects

inline
bslma::Allocator* ThroughputBenchmark::allocator() const
{
    // This object stores no allocator pointer of its own; report the one held
    // by the thread-group vector.
    bslma::Allocator *const mechanism_p =
                                    d_threadGroups.get_allocator().mechanism();
    return mechanism_p;
}

                  // --------------------------------------
                  // class ThroughputBenchmark_WorkFunction
                  // --------------------------------------

// CREATORS
inline
ThroughputBenchmark_WorkFunction::ThroughputBenchmark_WorkFunction(
                                            ThroughputBenchmark_WorkData& data)
: d_data(data)
{
    // 'd_data' is a reference member, so this functor binds to the caller's
    // 'data' object rather than copying it.
}

                    // ----------------------------------
                    // class ThroughputBenchmark_TestUtil
                    // ----------------------------------

// CREATORS
inline
ThroughputBenchmark_TestUtil::ThroughputBenchmark_TestUtil(
                                                     ThroughputBenchmark& data)
: d_data(data)
{
    // 'd_data' is a reference member, so this utility binds to the caller's
    // benchmark object rather than copying it.
}

// MANIPULATORS
inline
bsls::AtomicInt& ThroughputBenchmark_TestUtil::state()
{
    // Friendship with 'ThroughputBenchmark' grants access to its private
    // state flag.
    bsls::AtomicInt& benchmarkState = d_data.d_state;
    return benchmarkState;
}

inline
bsl::vector<ThroughputBenchmark::ThreadGroup>&
                                   ThroughputBenchmark_TestUtil::threadGroups()
{
    // Friendship with 'ThroughputBenchmark' grants access to its private
    // thread-group vector.
    bsl::vector<ThroughputBenchmark::ThreadGroup>& groups =
                                                         d_data.d_threadGroups;
    return groups;
}

}  // close package namespace
}  // close enterprise namespace

#endif

// ----------------------------------------------------------------------------
// Copyright 2019 Bloomberg Finance L.P.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------- END-OF-FILE ----------------------------------