// bsls_performancehint.h -*-C++-*- #ifndef INCLUDED_BSLS_PERFORMANCEHINT #define INCLUDED_BSLS_PERFORMANCEHINT #include <bsls_ident.h> BSLS_IDENT("$Id: $") //@PURPOSE: Provide performance hints for code optimization. // //@CLASSES: // bsls::PerformanceHint: namespace for performance optimization hints // //@MACROS: // BSLS_PERFORMANCEHINT_PREDICT_LIKELY(X): 'X' probably evaluates to non-zero // BSLS_PERFORMANCEHINT_PREDICT_UNLIKELY(X): 'X' probably evaluates to zero // BSLS_PERFORMANCEHINT_PREDICT_EXPECT(X, Y): 'X' probably evaluates to 'Y' // BSLS_PERFORMANCEHINT_UNLIKELY_HINT: annotate block unlikely to be taken // BSLS_PERFORMANCEHINT_OPTIMIZATION_FENCE: prevent compiler optimizations // //@DESCRIPTION: This component provides performance hints for the compiler or // hardware. There are currently two types of hints that are supported: //: o branch prediction //: o data cache prefetching // ///Branch Prediction ///----------------- // The three macros provided, 'BSLS_PERFORMANCEHINT_PREDICT_LIKELY', // 'BSLS_PERFORMANCEHINT_PREDICT_UNLIKELY', and // 'BSLS_PERFORMANCEHINT_PREDICT_EXPECT', can be used to optimize compiler // generated code for branch prediction. The compiler, when given the hint // under *optimized* mode (i.e., with 'BDE_BUILD_TARGET_OPT' defined) will // rearrange the assembly instructions it generates to minimize the number of // jumps needed. // // The following describes the macros provided by this component: //.. // Macro Name Description of Macro // ---------------------------------------- ----------------------------- // BSLS_PERFORMANCEHINT_PREDICT_LIKELY(expr) Hint to the compiler that the // specified *integral* 'expr' // expression is likely to // evaluate to non-zero. // Returns 'true' or 'false' // depending on the result of // the expression. // // BSLS_PERFORMANCEHINT_PREDICT_UNLIKELY(expr) Hint to the compiler that the // specified *integral* 'expr' // expression is likely to // evaluate to zero. Returns // 'true' or 'false' depending // on the result of the // expression. // // BSLS_PERFORMANCEHINT_PREDICT_EXPECT(expr, value) // Hint to the compiler that the // specified *integral* 'expr' // expression is likely to // evaluate to the specified // 'value'. Returns the result // of the expression. // // BSLS_PERFORMANCEHINT_UNLIKELY_HINT Hint to the compiler that the // block which contains the hint // is unlikely chosen. Use this // in conjunction with the // 'PREDICT_UNLIKELY' clause for // maximum portability. //.. // ///Warning ///- - - - // Please use the macros provided in this component *with* *caution*. Always // profile your code to get an idea of actual usage before attempting to // optimize with these macros. Furthermore, these macros are merely *hints* to // the compiler. Whether or not they will have visible effect on performance // is not guaranteed. Note that one can perform similar optimization with a // profile-based compilation. When compiled with the proper options, the // compiler can collect usage information of the code, and such information can // then be passed back to recompile the code in a more optimized form. Please // refer to the compiler manual for more information. // ///Limitations ///- - - - - - // There is a bug in gcc 4.2, 4.3, and 4.4 such that when using the branch // prediction macros with multiple conditions, the generated code might not be // properly optimized. For example: //.. // if (BSLS_PERFORMANCEHINT_PREDICT_LIKELY(a && b)) { // // ... // } //.. // The work-around is simply to split the conditions: //.. // if (BSLS_PERFORMANCEHINT_PREDICT_LIKELY(a) // && BSLS_PERFORMANCEHINT_PREDICT_LIKELY(b)) { // // ... // } //.. // This applies to all of the "likely", "unlikely", and "expect" macros defined // in this component. Note that a bug report has been filed: //.. // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42233 //.. // ///Data Cache Prefetching ///---------------------- // The two functions provided in the 'bsls::PerformanceHint' 'struct' are // 'prefetchForReading' and 'prefetchForWriting'. Use of these functions will // cause the compiler to generate prefetch instructions to prefetch one cache // line worth of data at the specified address into the cache line to minimize // processor stalls. //.. // Function Name Description of Function // ------------------------ ------------------------------------------ // prefetchForReading(address) Prefetches one cache line worth of data at // the specified 'address' for reading. // // prefetchForWriting(address) Prefetches one cache line worth of data at // the specified 'address' for writing. //.. // ///Warning ///- - - - // These functions must be used *with* *caution*. Inappropriate use of these // functions degrades performance. Note that there should be sufficient time // for the prefetch instruction to finish before the specified address is // accessed, otherwise prefetching will be pointless. A profiler should be // used to understand the program's behavior before attempting to optimize with // these functions. // ///Optimization Fence ///------------------ // The macro 'BSLS_PERFORMANCEHINT_OPTIMIZATION_FENCE' prevents some compiler // optimizations, particularly compiler instruction reordering. This fence // does *not* map to a CPU instruction and has no impact on processor // instruction re-ordering, and therefore should not be used to synchronize // memory between threads. The fence may be useful in unusual contexts, like // performing benchmarks, or working around bugs identified in the compiler's // optimizer. // ///Warning ///- - - - // This macro should be used *with* *caution*. The macro will generally // decrease the performance of code on which it is applied, and is not // implemented on all platforms. // ///Usage ///----- // The following series of examples illustrates use of the macros and functions // provided by this component. // ///Example 1: Using the Branch Prediction Macros ///- - - - - - - - - - - - - - - - - - - - - - - // The following demonstrates the use of 'BSLS_PERFORMANCEHINT_PREDICT_LIKELY' // and 'BSLS_PERFORMANCEHINT_PREDICT_UNLIKELY' to generate more efficient // assembly instructions. Note the use of 'BSLS_PERFORMANCEHINT_UNLIKELY_HINT' // inside the 'if' branch for maximum portability. //.. // volatile int global; // // void foo() // { // global = 1; // } // // void bar() // { // global = 2; // } // // int main(int argc, char **argv) // { // argc = std::atoi(argv[1]); // // for (int x = 0; x < argc; ++x) { // int y = std::rand() % 10; // // // Correct usage of 'BSLS_PERFORMANCEHINT_PREDICT_LIKELY' since // // there are nine of ten chance that this branch is taken. // // if (BSLS_PERFORMANCEHINT_PREDICT_LIKELY(8 != y)) { // foo(); // } // else { // BSLS_PERFORMANCEHINT_UNLIKELY_HINT; // bar(); // } // } // return 0; // } //.. // An excerpt of the assembly code generated using 'xlC' Version 10 on AIX from // this small program is: //.. // b8: 2c 00 00 08 cmpwi r0,8 // bc: 41 82 00 38 beq- f4 <.main+0xb4> // ^ // Note that if register r0 (y) equals 8, branch to // instruction f4 (a jump). The '-' after 'beq' // indicates that the branch is unlikely to be taken. // The predicted code path continues the 'if' // statement, which calls 'foo' below. // // c0: 4b ff ff 41 bl 0 <.foo__Fv> // ... // f4: 4b ff ff 2d bl 20 <.bar__Fv> //.. // Now, if 'BSLS_PERFORMANCEHINT_PREDICT_LIKELY' is changed to // 'BSLS_PERFORMANCEHINT_PREDICT_UNLIKELY', and the // 'BSLS_PERFORMANCEHINT_UNLIKELY_HINT' is moved to the first branch, the // following assembly code will be generated: //.. // b8: 2c 00 00 08 cmpwi r0,8 // bc: 40 c2 00 38 bne- f4 <.main+0xb4> // ^ // Note that the test became a "branch not equal" // test. The predicted code path now continues to the // 'else' statement, which calls 'bar' below. // // c0: 4b ff ff 61 bl 20 <.bar__Fv> // ... // f4: 4b ff ff 0d bl 0 <.foo__Fv> //.. // A timing analysis shows that effective use of branch prediction can have a // material effect on code efficiency: //.. // $time ./unlikely.out 100000000 // // real 0m2.022s // user 0m2.010s // sys 0m0.013s // // $time ./likely.out 100000000 // // real 0m2.159s // user 0m2.149s // sys 0m0.005s //.. // ///Example 2: Using 'BSLS_PERFORMANCEHINT_PREDICT_EXPECT' /// - - - - - - - - - - - - - - - - - - - - - - - - - - - // This macro is essentially the same as the '__builtin_expect(expr, value)' // macro that is provided by some compilers. This macro allows the user to // define more complex hints to the compiler, such as the optimization of // 'switch' statements. For example, given: //.. // int x = std::rand() % 4; //.. // the following is incorrect usage of 'BSLS_PERFORMANCEHINT_PREDICT_EXPECT', // since the probability of getting a 3 is equivalent to the other // possibilities ( 0, 1, 2 ): //.. // switch (BSLS_PERFORMANCEHINT_PREDICT_EXPECT(x, 3)) { // case 1: //.. // break; // case 2: //.. // break; // case 3: //.. // break; // default: break; // } //.. // However, this is sufficient to illustrate the intent of this macro. // ///Example 3: Cache Line Prefetching ///- - - - - - - - - - - - - - - - - // The following demonstrates use of 'prefetchForReading' and // 'prefetchForWriting' to prefetch data cache lines: //.. // const int SIZE = 10 * 1024 * 1024; // // void add(int *arrayA, int *arrayB) // { // for (int i = 0; i < SIZE / 8; ++i){ // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // } // } // // int array1[SIZE]; // int array2[SIZE]; // // int main() // { // BloombergLP::bsls::Stopwatch timer; // timer.start(); // for (int i = 0; i < 10; ++i) { // add(array1, array2); // } // printf("time: %f\n", timer.elapsedTime()); // return 0; // } //.. // The above code simply adds two arrays together multiple times. Using // 'bsls::Stopwatch', we recorded the running time and printed it to 'stdout': //.. // $./prefetch.sundev1.tsk // time: 8.446806 //.. // Now, we can observe that in the 'add' function, 'arrayA' and 'arrayB' are // accessed sequentially for the majority of the program. 'arrayA' is used for // writing and 'arrayB' is used for reading. Making use of prefetch, we add // calls to 'prefetchForReading' and 'prefetchForWriting': //.. // void add2(int *arrayA, int *arrayB) // { // for (int i = 0; i < SIZE / 8; ++i){ // using namespace BloombergLP; // Generally avoid 'using' in this TD. // bsls::PerformanceHint::prefetchForWriting((int *) arrayA + 16); // bsls::PerformanceHint::prefetchForReading((int *) arrayB + 16); // // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // *arrayA += *arrayB; ++arrayA; ++arrayB; // } // } //.. // Adding the prefetch improves the program's efficiency: //.. // $./prefetch.sundev1.tsk // time: 6.442100 //.. // Note that we prefetch the address '16 * sizeof(int)' bytes away from // 'arrayA'. This is such that the prefetch instruction has sufficient time to // finish before the data is actually accessed. To see the difference, if we // changed '+ 16' to '+ 4': //.. // $./prefetch.sundev1.tsk // time: 6.835928 //.. // And we get less of an improvement in speed. Similarly, if we prefetch too // far away from the data use, the data might be removed from the cache before // it is looked at and the prefetch is wasted. #include <bsls_platform.h> #if defined(BSLS_PLATFORM_CMP_IBM) #include <builtins.h> // for 'dcbt', '__builtin_expect' #endif #if defined(BSLS_PLATFORM_CMP_HP) #include <machine/sys/builtins.h> #include <machine/sys/inline.h> #endif #if defined(BSLS_PLATFORM_CMP_SUN) #include <sun_prefetch.h> // for 'sparc_prefetch_write|read_many' #include <mbarrier.h> #endif #if defined(BSLS_PLATFORM_CMP_MSVC) #include <xmmintrin.h> // for '_mm_prefetch', '_MM_HINT_T0' #include <intrin.h> #endif namespace BloombergLP { // ============================ // BSLS_PERFORMANCEHINT_PREDICT // ============================ // These macros are effective in *optimized* mode only, and *only* on platforms // that support '__builtin_expect'. #if defined(BDE_BUILD_TARGET_OPT) && \ (defined(BSLS_PLATFORM_CMP_CLANG) || \ defined(BSLS_PLATFORM_CMP_GNU) || \ defined(BSLS_PLATFORM_CMP_IBM)) #define BSLS_PERFORMANCEHINT_PREDICT_LIKELY(expr) \ __builtin_expect(!!(expr), 1) #define BSLS_PERFORMANCEHINT_PREDICT_UNLIKELY(expr) \ __builtin_expect(!!(expr), 0) #define BSLS_PERFORMANCEHINT_PREDICT_EXPECT(expr, value) \ __builtin_expect((expr), (value)) #else #define BSLS_PERFORMANCEHINT_PREDICT_LIKELY(expr) (expr) #define BSLS_PERFORMANCEHINT_PREDICT_UNLIKELY(expr) (expr) #define BSLS_PERFORMANCEHINT_PREDICT_EXPECT(expr, value) (expr) #endif // Define the 'BSLS_PERFORMANCEHINT_HAS_ATTRIBUTE_COLD' and // 'BSLS_PERFORMANCEHINT_ATTRIBUTE_COLD' macros. #if defined(BSLS_PLATFORM_CMP_CLANG) #if __has_attribute(cold) #define BSLS_PERFORMANCEHINT_ATTRIBUTE_COLD __attribute__((cold)) #endif #elif defined(BSLS_PLATFORM_CMP_GNU) #define BSLS_PERFORMANCEHINT_ATTRIBUTE_COLD __attribute__((cold)) #endif #if !defined(BSLS_PERFORMANCEHINT_ATTRIBUTE_COLD) #define BSLS_PERFORMANCEHINT_ATTRIBUTE_COLD #else #define BSLS_PERFORMANCEHINT_HAS_ATTRIBUTE_COLD 1 #endif // Define the 'BSLS_PERFORMANCEHINT_UNLIKELY_HINT' macro. #if defined(BDE_BUILD_TARGET_OPT) && defined(BSLS_PLATFORM_CMP_SUN) #define BSLS_PERFORMANCEHINT_UNLIKELY_HINT \ BloombergLP::bsls::PerformanceHint::rarelyCalled() #elif defined(BDE_BUILD_TARGET_OPT) && \ (defined(BSLS_PLATFORM_CMP_IBM) || BSLS_PERFORMANCEHINT_HAS_ATTRIBUTE_COLD) #define BSLS_PERFORMANCEHINT_UNLIKELY_HINT \ BloombergLP::bsls::PerformanceHint::lowFrequency() #else #define BSLS_PERFORMANCEHINT_UNLIKELY_HINT #endif // ======================================= // BSLS_PERFORMANCEHINT_OPTIMIZATION_FENCE // ======================================= #if defined(BSLS_PLATFORM_CMP_IBM) #define BSLS_PERFORMANCEHINT_OPTIMIZATION_FENCE __fence() #elif defined(BSLS_PLATFORM_CMP_MSVC) #pragma intrinsic(_ReadWriteBarrier) #define BSLS_PERFORMANCEHINT_OPTIMIZATION_FENCE _ReadWriteBarrier() #elif defined(BSLS_PLATFORM_CMP_HP) #define BSLS_PERFORMANCEHINT_OPTIMIZATION_FENCE \ _Asm_sched_fence(_UP_MEM_FENCE|_DOWN_MEM_FENCE) #elif defined(BSLS_PLATFORM_CMP_SUN) #define BSLS_PERFORMANCEHINT_OPTIMIZATION_FENCE __compiler_barrier() #elif defined(BSLS_PLATFORM_CMP_GNU) \ || defined(BSLS_PLATFORM_CMP_CLANG) #define BSLS_PERFORMANCEHINT_OPTIMIZATION_FENCE asm volatile("":::"memory") #else #error "BSLS_PERFORMANCEHINT_OPTIMIZATION_FENCE not implemented" #endif namespace bsls { // ====================== // struct PerformanceHint // ====================== struct PerformanceHint { // This 'struct' provides a namespace for a suite of functions that give // performance hints to the compiler or hardware. // CLASS METHODS static void prefetchForReading(const void *address); // Prefetch one cache line worth of data at the specified 'address' for // reading if the compiler built-in is available (see to the component // level document for limitations). Otherwise this method has no // effect. static void prefetchForWriting(void *address); // Prefetch one cache line worth of data at the specified 'address' for // writing if the compiler built-in is available (see to the component // level document for limitations). Otherwise this method has no // effect. static void rarelyCalled(); // This is an empty function that is marked as rarely called using // pragmas. If this function is placed in a block of code inside a // branch, the compiler will optimize the assembly code generated and // mark the block as unlikely. Note that this function is // intentionally not inlined. #if defined(BDE_BUILD_TARGET_OPT) #if defined(BSLS_PLATFORM_CMP_SUN) // Pragma to flag the function as rarely called. #pragma rarely_called(rarelyCalled) // Pragma to flag the function as no side effect. This is necessary because a // function that is marked as rarely called cannot be inlined without losing // the 'rarely_called' characteristics. When marked as no side effect, even an // out-of-line function will not trigger a function call. #pragma no_side_effect(rarelyCalled) #endif // BSLS_PLATFORM_CMP_SUN #endif // BDE_BUILD_TARGET_OPT BSLS_PERFORMANCEHINT_ATTRIBUTE_COLD static void lowFrequency(); // This is an empty function that is marked with low execution // frequency using pragmas. If this function is placed in a block of // code inside a branch, the compiler will optimize the assembly code // generated and mark the block as unlikely. }; // ============================================================================ // INLINE DEFINITIONS // ============================================================================ // ---------------------- // struct PerformanceHint // ---------------------- // CLASS METHODS inline void PerformanceHint::prefetchForReading(const void *address) { #if defined(BSLS_PLATFORM_CMP_GNU) || defined(BSLS_PLATFORM_CMP_CLANG) __builtin_prefetch(address, 0); #elif defined(BSLS_PLATFORM_CMP_IBM) __dcbt(const_cast<void *>(address)); #elif defined(BSLS_PLATFORM_CMP_SUN) sparc_prefetch_read_many(const_cast<void *>(address)); #elif defined(BSLS_PLATFORM_CMP_MSVC) _mm_prefetch(static_cast<const char*>(address), _MM_HINT_T0); // '_MM_HINT_T0' fetches data to all levels of cache. #elif defined(BSLS_PLATFORM_CMP_HP) _Asm_lfetch(_LFTYPE_NONE, _LFHINT_NTA, address); #else // no-op #endif } inline void PerformanceHint::prefetchForWriting(void *address) { #if defined(BSLS_PLATFORM_CMP_GNU) || defined(BSLS_PLATFORM_CMP_CLANG) __builtin_prefetch(address, 1); #elif defined(BSLS_PLATFORM_CMP_IBM) __dcbtst(address); #elif defined(BSLS_PLATFORM_CMP_SUN) sparc_prefetch_write_many(address); #elif defined(BSLS_PLATFORM_CMP_MSVC) _mm_prefetch(static_cast<const char*>(address), _MM_HINT_T0); // '_MM_HINT_T0' fetches data to all levels of cache. #elif defined(BSLS_PLATFORM_CMP_HP) _Asm_lfetch_excl(_LFTYPE_NONE, _LFHINT_NTA, address); #else // no-op #endif } // This function must be inlined for the pragma to take effect on the branch // prediction in IBM xlC. BSLS_PERFORMANCEHINT_ATTRIBUTE_COLD inline void PerformanceHint::lowFrequency() { #if defined(BDE_BUILD_TARGET_OPT) && defined(BSLS_PLATFORM_CMP_IBM) #pragma execution_frequency(very_low) #endif } } // close package namespace #ifndef BDE_OPENSOURCE_PUBLICATION // BACKWARD_COMPATIBILITY // ============================================================================ // BACKWARD COMPATIBILITY // ============================================================================ typedef bsls::PerformanceHint bsls_PerformanceHint; // This alias is defined for backward compatibility. #endif // BDE_OPENSOURCE_PUBLICATION -- BACKWARD_COMPATIBILITY } // close enterprise namespace #endif // ---------------------------------------------------------------------------- // Copyright 2013 Bloomberg Finance L.P. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ----------------------------- END-OF-FILE ----------------------------------